# <font face = 'Impact' color = '#FFAEBC' > Loading, Organizing, and Storing Data One </font>
#### <font face = 'Times New Roman' color = '#B5E5CF'> License: GPL v3.0 </font>
#### <font face = 'Times New Roman' color = '#B5E5CF'> Author and Trainer: Paolo Hilado MSc. (Data Science)</font>
In this section, we focus on importing, structuring, and saving data using Python. We'll use the pandas library along with base Python functions to load data from common file formats such as CSV and Excel. Once the data is loaded into pandas DataFrames, we’ll explore basic techniques to organize and inspect the data—such as viewing column names, checking data types, and handling missing values. Finally, we’ll learn how to store the cleaned and structured data back into files for future use.

In [3]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import researchpy as rp
import os

In [4]:
# Checking out the current working directory; relative file names passed to
# the loaders are resolved against this folder
print(os.getcwd())

C:\Users\Sheila Magolama\pythonTraining\day 1 data


In [5]:
# Loading a spreadsheet into a DataFrame and assigning it to a variable
# We will open and work on "ExperimentTwo.xlsx"
df = pd.read_excel("ExperimentTwo.xlsx")

In [6]:
# Viewing the dataframe (a long frame is displayed truncated, with "..."
# standing in for the omitted rows)
df

Unnamed: 0,ID,Pre,Post,Group
0,1,2.03,17.23,Cntrl
1,2,4.02,16.04,Cntrl
2,3,14.34,19.22,Cntrl
3,4,15.55,19.45,Cntrl
4,5,2.05,18.53,Cntrl
...,...,...,...,...
70,71,8.34,6.86,Tx
71,72,7.52,17.63,Tx
72,73,5.21,18.84,Tx
73,74,3.73,15.11,Tx


In [7]:
# Viewing the first 10 rows
df.head(10)

Unnamed: 0,ID,Pre,Post,Group
0,1,2.03,17.23,Cntrl
1,2,4.02,16.04,Cntrl
2,3,14.34,19.22,Cntrl
3,4,15.55,19.45,Cntrl
4,5,2.05,18.53,Cntrl
5,6,11.07,21.0,Cntrl
6,7,11.03,19.28,Cntrl
7,8,1.37,19.63,Cntrl
8,9,6.02,12.66,Cntrl
9,10,15.75,17.55,Cntrl


In [8]:
# Viewing the last 10 rows
df.tail(10)

Unnamed: 0,ID,Pre,Post,Group
65,66,6.43,12.75,Tx
66,67,5.46,13.89,Tx
67,68,11.69,16.01,Tx
68,69,17.32,12.55,Tx
69,70,8.95,19.67,Tx
70,71,8.34,6.86,Tx
71,72,7.52,17.63,Tx
72,73,5.21,18.84,Tx
73,74,3.73,15.11,Tx
74,75,12.38,19.07,Tx


In [9]:
# Viewing all rows: setting display.max_rows to None removes the limit on
# how many rows are shown when a dataframe is displayed
pd.set_option('display.max_rows', None)

In [12]:
# Display the dataframe again to see the effect of the display option
# NOTE(review): the recorded output below is still truncated; the execution
# counts (In [12] after In [11]) suggest this cell was last run after the
# option had been reset -- re-run the cells in order to confirm
df

Unnamed: 0,ID,Pre,Post,Group
0,1,2.03,17.23,Cntrl
1,2,4.02,16.04,Cntrl
2,3,14.34,19.22,Cntrl
3,4,15.55,19.45,Cntrl
4,5,2.05,18.53,Cntrl
...,...,...,...,...
70,71,8.34,6.86,Tx
71,72,7.52,17.63,Tx
72,73,5.21,18.84,Tx
73,74,3.73,15.11,Tx


In [11]:
# Restore display.max_rows to its default value
pd.reset_option('display.max_rows')

In [14]:
# Selecting a single column from the dataframe (the result is a Series)
df["Post"] # Approach 1: bracket notation with the column label
#df.Post # Approach 2: attribute (dot) notation

0     17.23
1     16.04
2     19.22
3     19.45
4     18.53
      ...  
70     6.86
71    17.63
72    18.84
73    15.11
74    19.07
Name: Post, Length: 75, dtype: float64

In [15]:
# Selecting the "Pre" column with attribute (dot) notation
df.Pre

0      2.03
1      4.02
2     14.34
3     15.55
4      2.05
      ...  
70     8.34
71     7.52
72     5.21
73     3.73
74    12.38
Name: Pre, Length: 75, dtype: float64

In [16]:
# All rows of the first column (position 0, which is ID)
subset = df.iloc[:,0]
subset

0      1
1      2
2      3
3      4
4      5
      ..
70    71
71    72
72    73
73    74
74    75
Name: ID, Length: 75, dtype: int64

In [17]:
# Presenting the first 49 rows (positions 0 to 48) of the 1st and 3rd
# columns (ID and Post)
newsub = df.iloc[:49, [0,2]]
newsub

Unnamed: 0,ID,Post
0,1,17.23
1,2,16.04
2,3,19.22
3,4,19.45
4,5,18.53
5,6,21.0
6,7,19.28
7,8,19.63
8,9,12.66
9,10,17.55


In [18]:
# Get the Pre and Post measures (column positions 1 and 2) for the first
# 20 rows, i.e. the rows with ID 1-20
newsub = df.iloc[:20, [1,2]]
newsub

Unnamed: 0,Pre,Post
0,2.03,17.23
1,4.02,16.04
2,14.34,19.22
3,15.55,19.45
4,2.05,18.53
5,11.07,21.0
6,11.03,19.28
7,1.37,19.63
8,6.02,12.66
9,15.75,17.55


In [19]:
# First 20 rows of the first three columns (ID, Pre, Post)
df.iloc[:20, :3]

Unnamed: 0,ID,Pre,Post
0,1,2.03,17.23
1,2,4.02,16.04
2,3,14.34,19.22
3,4,15.55,19.45
4,5,2.05,18.53
5,6,11.07,21.0
6,7,11.03,19.28
7,8,1.37,19.63
8,9,6.02,12.66
9,10,15.75,17.55


In [21]:
# Presenting rows 10 to 18 of the second column (Pre)
# NOTE: the stop position of an iloc slice is exclusive, so 10:19 ends at
# row 18, and column position 1 is Pre rather than the first column
eg = df.iloc[10:19, 1]
eg

10     5.34
11     1.67
12     5.72
13    16.59
14    16.01
15    18.66
16    18.68
17    18.86
18    14.24
Name: Pre, dtype: float64

In [20]:
# Presenting the rows at positions 5, 6, 50 and 57 for all columns
nsub = df.iloc[[5,6,50,57], :]
nsub

Unnamed: 0,ID,Pre,Post,Group
5,6,11.07,21.0,Cntrl
6,7,11.03,19.28,Cntrl
50,51,0.37,18.58,Tx
57,58,6.3,18.25,Tx


In [23]:
# Presenting rows 10 to 19 of the first column (ID); the stop position 20
# is exclusive, so the slice ends at row 19
eg = df.iloc[10:20, 0]
eg

10    11
11    12
12    13
13    14
14    15
15    16
16    17
17    18
18    19
19    20
Name: ID, dtype: int64

In [24]:
# Checking the structure of the dataframe: column names, non-null counts,
# dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      75 non-null     int64  
 1   Pre     74 non-null     float64
 2   Post    74 non-null     float64
 3   Group   74 non-null     object 
dtypes: float64(2), int64(1), object(1)
memory usage: 2.5+ KB


In [25]:
# Checking whether the data set contains any NA/missing value at all
# (a single True/False answer for the whole dataframe)
df.isnull().values.any()

np.True_

In [26]:
# Check for NAs per column (True means the column has at least one missing value)
df.isnull().any()

ID       False
Pre       True
Post      True
Group     True
dtype: bool

In [27]:
# Getting simple descriptives for a numeric column
df.iloc[:, 1].describe() # column position 1 is Pre, so this summarises the pre-test measures
# For the post-test measures by label: desc = df["Post"].describe()

count    74.000000
mean     10.564865
std       5.501477
min       0.370000
25%       5.747500
50%      11.050000
75%      15.185000
max      19.190000
Name: Pre, dtype: float64

In [28]:
# NOTE: describe is referenced here without parentheses, so the cell echoes
# the bound method (see the output below) instead of the summary statistics;
# call df.Post.describe() to get the descriptives for the post-test measures
df.Post.describe

<bound method NDFrame.describe of 0     17.23
1     16.04
2     19.22
3     19.45
4     18.53
      ...  
70     6.86
71    17.63
72    18.84
73    15.11
74    19.07
Name: Post, Length: 75, dtype: float64>

In [29]:
# Descriptives for a categorical variable: non-missing count, number of
# unique values, most frequent value (top) and how often it occurs (freq)
ndesc = df["Group"].describe()
# To list the distinct group labels instead: df["Group"].unique()
ndesc

count        74
unique        2
top       Cntrl
freq         37
Name: Group, dtype: object

In [30]:
# Descriptives of Pre and Post when grouped by setup (the Group column),
# rounded to 2 decimal places
descPre = np.round(rp.summary_cont(df["Pre"].groupby(df["Group"])),2)
# Same summary on another dataframe (complete1 is not defined in this part of the notebook):
# descPre2 = rp.summary_cont(complete1.Pre.groupby(complete1.Group))
descPost = np.round(rp.summary_cont(df["Post"].groupby(df["Group"])),2)
# descPost2 = rp.summary_cont(complete1.Post.groupby(complete1.Group))







In [31]:
# Grouped descriptives for Pre: N, mean, SD, SE and 95% confidence interval per group
descPre

Unnamed: 0_level_0,N,Mean,SD,SE,95% Conf.,Interval
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Cntrl,37,11.21,5.76,0.95,9.29,13.14
Tx,37,9.92,5.22,0.86,8.17,11.66


In [32]:
# Grouped descriptives for Post: N, mean, SD, SE and 95% confidence interval per group
descPost

Unnamed: 0_level_0,N,Mean,SD,SE,95% Conf.,Interval
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Cntrl,37,17.23,3.52,0.58,16.05,18.4
Tx,37,16.62,3.17,0.52,15.56,17.68


In [33]:
import qdesc as qd

In [34]:
# Descriptive summary from qdesc; the output below covers the numeric
# columns (ID, Pre, Post)
# NOTE(review): MAD and AD_stat come back empty for Pre and Post, the two
# numeric columns with a missing value (74 non-null in df.info()) --
# presumably the NAs need to be handled first; confirm against the qdesc docs
qd.desc(df)

Unnamed: 0,count,mean,std,median,MAD,min,max,AD_stat,5% crit_value,1% crit_value
ID,75.0,38.0,21.79,38.0,19.0,1.0,75.0,0.81,0.75,1.04
Pre,74.0,10.56,5.5,11.05,,0.37,19.19,,0.75,1.04
Post,74.0,16.92,3.34,18.2,,5.28,21.04,,0.75,1.04
