# Module 2 - Explore Data

Libraries provide pre-written code for functionality that is not included in basic Python. Once a library has been imported, any of its functions can be used throughout the entire notebook.

In [1]:
#import libraries
import pandas as pd
import numpy as np

## Load data

### csv files

CSV stands for "comma-separated values"; a CSV file is a plain text file where each value is separated by some delimiter (usually commas, but it can also be tabs, semicolons, spaces, etc.).

In [2]:
#load csv file data with headers
#by default read_csv uses the first line of the file as the column names

location = "datasets/smallgradesh.csv"
df = pd.read_csv(location)

In [3]:
#show the first rows of the dataframe (head returns 5 rows unless told otherwise)
df.head()
#df.head?  <- uncomment to open the help text for head

Unnamed: 0,Name,Grade
0,Marcia,82.4
1,Kadeem,78.2
2,Nash,79.3
3,Noelani,83.2
4,Noelani,87.4


In [5]:
#show the last 3 rows of the dataframe
df.tail(3)

Unnamed: 0,Name,Grade
1997,Mercedes,84.9
1998,Lucius,69.1
1999,Linus,79.6


In [6]:
#load data without headers
#header=None tells pandas the file has no header row, so the columns are numbered 0, 1, ...

location2 = "datasets/smallgrades.csv"
df_nohead = pd.read_csv(location2, header=None) #try w/o header=None to see the first data row used as the header
df_nohead.head()

Unnamed: 0,0,1
0,Marcia,82.4
1,Kadeem,78.2
2,Nash,79.3
3,Noelani,83.2
4,Noelani,87.4


In [7]:
#add headers during data load
#names= supplies the column names for a file that has no header row

df_during = pd.read_csv(location2, names=['Name', 'Grade'])
df_during.head()

Unnamed: 0,Name,Grade
0,Marcia,82.4
1,Kadeem,78.2
2,Nash,79.3
3,Noelani,83.2
4,Noelani,87.4


In [8]:
#add headers after data load
#assigning to .columns renames the columns of df_nohead in place (one name per existing column)

df_nohead.columns = ['Name', 'Grade']
df_nohead.head()

Unnamed: 0,Name,Grade
0,Marcia,82.4
1,Kadeem,78.2
2,Nash,79.3
3,Noelani,83.2
4,Noelani,87.4


In [9]:
#create data
#two parallel lists: the grade at each position belongs to the name at the same position

names = ['Bob', 'Jessica', 'Mary', 'John', 'Mel']
grades = [76, 95, 77, 78, 99]

#pair each name with its grade -> list of (name, grade) tuples
GradeList = [(student, score) for student, score in zip(names, grades)]

In [10]:
#export csv files
#index=False leaves out the row labels and header=False leaves out the column names,
#so the file contains only the data rows

df = pd.DataFrame(data = GradeList, columns=['Names','Grades'])
df.to_csv('studentgrades.csv',index=False,header=False)

### Excel files

In [11]:
#display the list of (name, grade) tuples
GradeList

[('Bob', 76), ('Jessica', 95), ('Mary', 77), ('John', 78), ('Mel', 99)]

In [12]:
#import Excel file
#read_excel works like read_csv but reads a spreadsheet

location = "datasets/gradedata.xlsx"
df = pd.read_excel(location) #overwrites the info from the df variable in the examples above
df.head()

Unnamed: 0,fname,lname,gender,age,exercise,hours,grade,address
0,Marcia,Pugh,female,17,3,10,82.4,"7379 Highland Rd. , Dublin, GA 31021"
1,Kadeem,Morrison,male,18,4,4,78.2,"8 Bayport St. , Honolulu, HI 96815"
2,Nash,Powell,male,18,5,9,79.3,"Encino, CA 91316, 3 Lilac Street"
3,Noelani,Wagner,female,14,2,7,83.2,"Riverview, FL 33569, 9998 North Smith Dr."
4,Noelani,Cherry,female,18,4,15,87.4,"97 SE. Ocean Street , Bethlehem, PA 18015"


In [13]:
#save dataframe as Excel file

#rebuild GradeList here so this cell can be run on its own
names = ['Bob','Jessica','Mary','John','Mel']
grades = [76,95,77,78,99]
GradeList = list(zip(names,grades))

df = pd.DataFrame(data = GradeList, columns=['Names','Grades'])

#the with-block saves and closes the workbook when it ends, even if an error occurs;
#the older writer.save() call was removed in pandas 2.0
with pd.ExcelWriter('dataframe.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='Sheet1')

In [15]:
#multiple sheets
#note: this writes to the same 'dataframe.xlsx' file name again, replacing any existing file

df = pd.DataFrame(data = GradeList, columns=['Names','Grades'])

#every to_excel call inside the with-block adds one sheet to the same workbook;
#the workbook is saved and closed when the block ends (writer.save() was removed in pandas 2.0)
with pd.ExcelWriter('dataframe.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='Sheet3')
    df.to_excel(writer, sheet_name='Sheet6')

### Exploratory Analysis

In [16]:
#load gradedata.csv file
#df is reused for the rest of the exploratory analysis section

location = "datasets/gradedata.csv"
df = pd.read_csv(location)

#preview the first rows to check the load worked
df.head()

Unnamed: 0,fname,lname,gender,age,exercise,hours,grade,address
0,Marcia,Pugh,female,17,3,10,82.4,"9253 Richardson Road, Matawan, NJ 07747"
1,Kadeem,Morrison,male,18,4,4,78.2,"33 Spring Dr., Taunton, MA 02780"
2,Nash,Powell,male,18,5,9,79.3,"41 Hill Avenue, Mentor, OH 44060"
3,Noelani,Wagner,female,14,2,7,83.2,"8839 Marshall St., Miami, FL 33125"
4,Noelani,Cherry,female,18,4,15,87.4,"8304 Charles Rd., Lewis Center, OH 43035"


In [17]:
#show the number of rows and columns in a dataframe
#shape is an attribute (no parentheses); the result is (rows, columns)
df.shape

(2000, 8)

In [18]:
#show the column names in the dataset
df.columns
#or print(df.columns) when this is not the last line of the cell

Index(['fname', 'lname', 'gender', 'age', 'exercise', 'hours', 'grade',
       'address'],
      dtype='object')

In [19]:
#show the data type of each column
#dtypes is an attribute (no parentheses) and gives one entry per column
df.dtypes

fname        object
lname        object
gender       object
age           int64
exercise      int64
hours         int64
grade       float64
address      object
dtype: object

In [20]:
#number of non-NA values in each column
#len(df) would count rows including NA values
df.count()

fname       2000
lname       2000
gender      2000
age         2000
exercise    2000
hours       2000
grade       2000
address     2000
dtype: int64

In [21]:
#largest value in the 'hours' column
df['hours'].max()

20

In [22]:
#smallest value in the 'hours' column
df['hours'].min()

0

In [23]:
#total of all values in the 'hours' column
df['hours'].sum()

21977

In [24]:
#mean (average) of the 'hours' column
df['hours'].mean()

10.9885

In [25]:
#median (middle value) of the 'hours' column
df['hours'].median()

11.0

In [26]:
#mode (most frequent value) of the 'hours' column
#the result is a Series because more than one value can tie for most frequent
df['hours'].mode()

0    11
dtype: int64

In [27]:
#how many times each distinct value appears in 'hours', most frequent first
df['hours'].value_counts()

11    196
10    188
12    179
9     156
14    155
13    152
8     143
15    123
7     110
6     106
16     93
5      83
17     81
18     61
4      58
3      40
19     34
20     22
2      15
0       3
1       2
Name: hours, dtype: int64

In [28]:
#standard deviation of the 'hours' column
df['hours'].std()

4.063942120993027

In [29]:
#descriptive statistics
#summarises the numeric columns: count, mean, std, min, quartiles and max
df.describe()

Unnamed: 0,age,exercise,hours,grade
count,2000.0,2000.0,2000.0,2000.0
mean,16.5785,3.0005,10.9885,82.55605
std,1.696254,1.423205,4.063942,9.747593
min,14.0,0.0,0.0,32.0
25%,15.0,2.0,8.0,75.575
50%,17.0,3.0,11.0,82.7
75%,18.0,4.0,14.0,89.7
max,19.0,5.0,20.0,100.0


In [30]:
#doesn't do anything on its own
#groupby only returns a GroupBy object; nothing is calculated until a function such as mean() is applied
df.groupby('gender')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000014C00FDA588>

In [31]:
#apply a stats or math function to the groups to get a result
#here: group the rows by gender, pick the 'hours' column, then average it per group
df.groupby('gender')['hours'].mean()

#mean of multiple columns
#df.groupby('gender')[['hours', 'exercise']].mean()

gender
female    10.932
male      11.045
Name: hours, dtype: float64

In [32]:
#use two columns to groupby
#numeric_only=True averages only the numeric columns; without it pandas 2.0+ raises
#a TypeError because text columns (fname, lname, address) cannot be averaged
df.groupby(['gender', 'age']).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,exercise,hours,grade
gender,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,14,3.132911,10.658228,82.058228
female,15,2.943038,11.050633,82.905696
female,16,3.132911,11.158228,82.582278
female,17,3.146893,10.943503,83.599435
female,18,2.978022,10.950549,82.5
female,19,2.952096,10.832335,82.592216
male,14,2.822785,11.170886,82.323418
male,15,3.021127,10.866197,82.671831
male,16,2.994318,11.465909,82.752841
male,17,2.927374,11.268156,82.949721


In [33]:
#pivot table default function is mean
#list the numeric columns in values= so the text columns (fname, lname, address) are not
#averaged; newer pandas versions raise an error instead of silently dropping them
pd.pivot_table(df, values=['age', 'exercise', 'grade', 'hours'], index=['gender'])

Unnamed: 0_level_0,age,exercise,grade,hours
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,16.568,3.047,82.7173,10.932
male,16.589,2.954,82.3948,11.045


In [34]:
#values= limits the pivot table to the listed column(s); here the mean of 'hours' per gender
pd.pivot_table(df, values=['hours'], index=['gender'])

Unnamed: 0_level_0,hours
gender,Unnamed: 1_level_1
female,10.932
male,11.045


In [35]:
#unique values in a column
#each distinct value is listed once, in the order it first appears
df['age'].unique()

array([17, 18, 14, 16, 15, 19], dtype=int64)

In [36]:
#find missing values
#load a version of the grade data into its own dataframe, df_missing
filename = "datasets/gradedatamissing.csv"
df_missing = pd.read_csv(filename)

#df_missing.head()

In [37]:
#total missing values in each column
#isnull() marks each missing cell as True; sum() counts the True values per column
df_missing.isnull().sum()

fname       0
lname       0
gender      2
age         7
exercise    8
hours       8
grade       2
address     0
dtype: int64

In [38]:
#show rows with missing values in the 'exercise' column
missing = df_missing['exercise'].isnull()
#missing holds one True/False value per row (True where 'exercise' is missing)
#passing it to loc keeps only the rows marked True
df_missing.loc[missing]

Unnamed: 0,fname,lname,gender,age,exercise,hours,grade,address
592,Mark,Hankerson,male,16.0,,17.0,92.1,"3351 Hibiscus St. Miami, Fl 33133"
673,Jeremy,Rowe,male,19.0,,10.0,83.4,"6 Madison St., Fall River, MA 02720"
683,Kalia,Moss,female,,,,,"760 East Stillwater St., Rosedale, NY 11422"
699,Gretchen,Haynes,female,,,,77.2,"500 Border St., New Lenox, IL 60451"
700,Ruth,Bowman,female,,,,100.0,"8621 Shub Farm Ave., Ocean Springs, MS 39564"
701,Eric,Walter,male,,,,93.2,"416 Glendale Ave., Green Cove Springs, FL 32043"
884,Dara,Giles,female,15.0,,17.0,88.4,"164 Shore Street, Wadsworth, OH 44281"
981,Fitzgerald,Barry,male,15.0,,10.0,69.6,"836 N. Miles St., Oak Park, MI 48237"


## Finding duplicate rows

In [39]:
#sample data in which some (name, grade) pairs appear more than once
names = ['Jessica', 'John', 'Bob', 'Jessica', 'Mary', 'John', 'Mel', 'Mel']
grades = [95, 78, 76, 95, 77, 78, 99, 100]

#one (name, grade) tuple per row
GradeList = [(student, score) for student, score in zip(names, grades)]

df = pd.DataFrame(GradeList, columns=['Names', 'Grades'])
df

Unnamed: 0,Names,Grades
0,Jessica,95
1,John,78
2,Bob,76
3,Jessica,95
4,Mary,77
5,John,78
6,Mel,99
7,Mel,100


In [40]:
#boolean values for if there's another row with the exact values in each column
dupe = df.duplicated()
#duplicate of Jessica, 95; John, 78
#returns false on first instance of duplicate row, so only the later copies are marked True
#Mel, 99 and Mel, 100 are not duplicates because the grades differ

In [41]:
#keep only the rows that dupe marks as True (the repeated rows)
df.loc[dupe]

Unnamed: 0,Names,Grades
3,Jessica,95
5,John,78


## Choosing Rows

In [42]:
#two columns of sample data
colA = [10, 20, 40, 50]
colB = ['no', 'yes', 'yes', 'no']

#pair the values up row by row -> list of (A, B) tuples
A_B = [(a_val, b_val) for a_val, b_val in zip(colA, colB)]

#no index is given, so the rows get the default labels 0, 1, 2, 3
df_A = pd.DataFrame(A_B, columns=['A', 'B'])
df_A

Unnamed: 0,A,B
0,10,no
1,20,yes
2,40,yes
3,50,no


In [43]:
#same data as before, but with text labels for the rows
index = ['a', 'b', 'c', 'd']
colA = [10, 20, 40, 50]
colB = ['no', 'yes', 'yes', 'no']

#build the rows from the lists defined in this cell, so the cell does not depend on
#a variable left over from an earlier cell
df_B = pd.DataFrame(data=list(zip(colA, colB)), columns=['A', 'B'], index=index)

df_B

Unnamed: 0,A,B
a,10,no
b,20,yes
c,40,yes
d,50,no


In [44]:
#loc: label based indexing
#here 0 is the row label, which for df_A happens to equal the row position
df_A.loc[0]

A    10
B    no
Name: 0, dtype: object

In [45]:
#iloc: select row by index(position) number
#position 0 is always the first row, whatever its label is
df_A.iloc[0]

A    10
B    no
Name: 0, dtype: object

In [46]:
#loc: label based indexing
#df_B uses text labels, so the row is selected by its label 'b'
df_B.loc['b']

A     20
B    yes
Name: b, dtype: object

In [47]:
#iloc: position 1 is the second row, which is the row labelled 'b'
df_B.iloc[1]

A     20
B    yes
Name: b, dtype: object

In [48]:
#let's see how it gets tricky
#df_C is a separate copy of df_A, with the default row labels 0, 1, 2, 3
df_C = df_A.copy()
df_C

Unnamed: 0,A,B
0,10,no
1,20,yes
2,40,yes
3,50,no


In [49]:
#same data again, but the row labels are numbers that do NOT match the row positions
index = [2,1,0,3]
colA = [10, 20, 40, 50]
colB = ['no', 'yes', 'yes', 'no']

#build the rows from the lists defined in this cell, so the cell does not depend on
#a variable left over from an earlier cell
df_D = pd.DataFrame(data=list(zip(colA, colB)), columns=['A', 'B'], index=index)
df_D

Unnamed: 0,A,B
2,10,no
1,20,yes
0,40,yes
3,50,no


In [50]:
#works like python slicing; give me index 0 through the index before 3
#the end position (3) is NOT included
df_C.iloc[0:3]

Unnamed: 0,A,B
0,10,no
1,20,yes
2,40,yes


In [51]:
#give me the row labelled 0, the row labelled 3, and everything in between
#unlike iloc, the end label (3) IS included
df_C.loc[0:3]

Unnamed: 0,A,B
0,10,no
1,20,yes
2,40,yes
3,50,no


In [52]:
#gave index position 0 through the index position before 3
#positions ignore the labels, so the first three rows come back (labels 2, 1 and 0)
df_D.iloc[0:3]

Unnamed: 0,A,B
2,10,no
1,20,yes
0,40,yes


In [53]:
#gave the label '0' and all the rows that are in between and ending with the row with label '3'
#"in between" follows the order the rows appear in df_D, not numeric order,
#so the rows labelled 2 and 1 (which come before label 0) are left out
df_D.loc[0:3]

Unnamed: 0,A,B
0,40,yes
3,50,no


# Bonus Code!

Below are some code snippets for advanced tasks

## Load multiple data files

In [54]:
import glob

#read every matching workbook first, then combine them in a single step;
#DataFrame.append was removed in pandas 2.0, and appending inside a loop re-copies all earlier rows each time
#sorted() makes the file order (and so the row order) the same on every machine
frames = [pd.read_excel(path) for path in sorted(glob.glob("datasets/data*.xlsx"))]

#pd.concat raises an error on an empty list, so fall back to an empty dataframe when no files match
all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

all_data.count() #each data file had 100 rows

fname    300
age      300
grade    300
dtype: int64

## Create random data

In [None]:
from numpy import random

names = ['Bob','Jessica','Mary','John','Mel']

#fixed seed so the same "random" data is produced on every run
random.seed(500)

#1000 random picks from names; randint's high value is exclusive, so the
#position is always between 0 and len(names) - 1
random_names = [names[random.randint(low=0, high=len(names))] for draw in range(1000)]

#1000 random birth counts between 0 and 999
births = [random.randint(low=0, high=1000) for draw in range(1000)]

#one (name, births) tuple per row
BabyDataSet = list(zip(random_names, births))
df = pd.DataFrame(BabyDataSet, columns=['Names', 'Births'])

df.head()