Import libraries

In [None]:
import pandas as pd
from scipy import stats
import matplotlib as mpl
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

Import data from git url

In [None]:
# Raw CSV on GitHub; the URL is split over two literals only to keep the line short
url = (
    'https://raw.githubusercontent.com/gagost/dataworkshop/'
    'cf6e1a6af9c4cfef8af0e963ee4d4ef6aaa07aef/30March_Data_Odontometric_csv.csv'
)
# Parse the file into a DataFrame, decoding it with the 'unicode_escape' codec
df = pd.read_csv(filepath_or_buffer=url, encoding='unicode_escape')

Some housekeeping stuff

In [None]:
# Lift pandas' display limits (rows, columns, total width, cell width) so nothing is truncated
for option_name in ('max_rows', 'max_columns', 'width', 'max_colwidth'):
    pd.set_option(f'display.{option_name}', None)

You can use python as an overpowered calculator if you want to

In [None]:
# Python works out the sum first, then print() shows the result
total = 8675300 + 9
print(total)

It's easy to get python to print messages back to you

In [None]:
# print() writes the text between the quotes to the cell's output
print("I love coding!")

In [None]:
# The string is wrapped in double quotes, so the apostrophe in "bee's" does not end it
print("This is the bee's knees")

There are a few ways to make python "ignore" parts of code. Super useful if you want to write notes, make contingencies, or if you are still working on the best way to code

In [None]:
print("Go")

# A hash mark turns the rest of the line into a comment; good for short notes

print("Little")

# The triple-quoted text below is a string that is never assigned or printed,
# so Python evaluates it and throws it away; it behaves like a multi-line comment
"""
But for long, multi-line comments you want to go with the triple quotes
"""

print("Rockstar")

In [None]:
#Objects and object storage are essential parts of any code-based stats program

# Store the text itself in the variable. print() always returns None, so
# writing peptalk=print(...) would leave peptalk holding nothing.
peptalk = "You've got this"
print(peptalk)

In [None]:
# Named word_list rather than list: assigning to the name `list` would hide
# Python's built-in list type for every cell that runs afterwards.
word_list = ['I', 'Love', 'Python']
word_list

Run some quick descriptive statistics to ensure the data imported correctly and perform basic data exploration

In [None]:
#Preview the top 10 rows of the data frame
df.head(n=10)

Let's get a quick count of each variable. I'll give you the code for a loop that'll run through the whole spreadsheet. In the output, the "False" count for each column is how many (non-missing) observations we have; "True" is how many are missing.

In [None]:
# Boolean frame with the same shape as df: True wherever a value is missing
missing_data = df.isnull()

# One value_counts() Series per column (False = observed, True = missing).
# Each Series is renamed to its column because pandas >= 2.0 names every
# value_counts() result "count", which would leave the rows of the summary
# table below indistinguishable from one another.
result = [
    missing_data[column].value_counts().rename(column)
    for column in missing_data.columns
]
print(result)

# One row per variable, one column per outcome (False / True)
df_na = pd.DataFrame(result)
# Move the variable names out of the index into an ordinary column
nas = df_na.reset_index()

In [None]:
#Summary statistics per variable (count, mean, standard deviation, min, quartiles, max)
summary_stats = df.describe()
summary_stats

Let's make some histograms! 

In [None]:
# Histogram of Bi_UCMD_mm with matplotlib's default styling
hist_values = df[['Bi_UCMD_mm']]
plt.hist(hist_values)

In [None]:
# Histogram of Bi_UI1BL_mm, this time with red bars
hist_values = df[['Bi_UI1BL_mm']]
plt.hist(hist_values, color='red')

In [None]:
# Red bars with a thin yellow outline
bar_style = dict(color='red', edgecolor='yellow', linewidth=1)
plt.hist(df[['Bi_UM1MLDB_mm']], **bar_style)

In [None]:
# Same histogram, but with a thicker yellow outline
bar_style = dict(color='red', edgecolor='yellow', linewidth=3)
plt.hist(df[['Bi_UM1MLDB_mm']], **bar_style)

# Now make some histos of your own in the cells below! Shout out if you find any abnormal distributions so we can show the class.

In [None]:
#All variable (column) names in one place. Copy and paste your way to victory!
column_names = df.columns.values
print(column_names)

# Cool. Now let's make some pretty pictures. We'll start with categorical variables.

In [None]:
#Boxplots

#Lets subset the Incisal MD values and drop every row with a missing value.
#The cleaned frame is built in one expression instead of calling
#dropna(inplace=True) on a slice of df, which triggers pandas'
#SettingWithCopyWarning. thresh=None is left out because pandas >= 2.0 raises
#a TypeError when both how= and thresh= are passed.
incisor_columns = ["Bi_UI1MD_mm","Bi_UI2MD_mm","Bi_LI1MD_mm","Bi_LI2MD_mm"]
Incisors = df[incisor_columns].dropna(axis=0, how='any')

#How can we make sure the subsetting worked? 
#put your answer here
#(print is needed here: a bare expression in the middle of a cell is not displayed)
print(Incisors.head(10))

#Time for pretty pictures
#notch expects a real boolean; the string 'True' only worked because any non-empty string is truthy
plt.boxplot(Incisors, notch=True, patch_artist=True, labels=['Upper_Central','Upper_Lateral','Lower_Central','Lower_Lateral'])

In [None]:
#Lets say we want to look at the %Completion value for incisors only by Ancestry

#Percentage of the four incisor MD measurements that are present (non-missing) in each row
incisor_md_columns = ["Bi_UI1MD_mm","Bi_UI2MD_mm","Bi_LI1MD_mm","Bi_LI2MD_mm"]
df['Incisors_Comp%'] = df[incisor_md_columns].count(axis=1) / len(incisor_md_columns) * 100
#print is needed here: a bare expression in the middle of a cell is not displayed
print(df.head(5))

#x is passed by keyword: seaborn >= 0.12 takes `data` as its only positional
#argument, so a bare "Ancestry" would collide with data=df and raise a TypeError
sns.catplot(x="Ancestry", col="Incisors_Comp%", hue="Sex", col_wrap=5, data=df, kind="count", height=5.5, aspect=0.9, palette='BuPu')
#Find hue palettes here: https://medium.com/@morganjonesartist/color-guide-to-seaborn-palettes-da849406d44f

In [None]:
#Cool, let's do it by age! Let's do the number of people by ancestry
#x is passed by keyword: seaborn >= 0.12 takes `data` as its only positional
#argument, so a bare "Ancestry" would collide with data=df and raise a TypeError
sns.catplot(x="Ancestry", col="Age_Group", col_wrap=3, data=df, kind="count", height=5.5, aspect=0.9, palette='tab20')

In [None]:
#Cool, let's dissect this a little bit more. A violin plot lets us use continuous age; sex and region are added too
violin_options = dict(orient="h", kind="violin", height=5.5, aspect=1, palette='bwr_r', dodge=True, bw=.2)
sns.catplot(data=df, x="Age_Clean_yr", y="Region", hue="Sex", col="Ancestry", **violin_options)

# Now let's focus on relational plots with continuous data

In [None]:
#Scatter plot of Bi_UI1MD_mm (x axis) against Bi_UI1BL_mm (y axis)
x_values = df['Bi_UI1MD_mm']
y_values = df['Bi_UI1BL_mm']
plt.scatter(x_values, y_values)

In [None]:
#Histograms and a scatter in a single figure, with a regression fit (kind="reg")
sns.jointplot(
    x='Bi_UI1MD_mm',
    y='Bi_UI1BL_mm',
    data=df,
    kind="reg",
)

In [None]:
#Regression plot with line, Upper first molar BL and MD measurements
#Marker styling for the scatter points: size 60 with a thin black outline
marker_style = {'s': 60, 'linewidths': .7, 'edgecolors': 'black'}
regplot = sns.lmplot(
    data=df, x="Bi_UM1BL_mm", y="Bi_UM1MD_mm", hue="Sex",
    height=7, aspect=1.6, robust=True, palette='cubehelix_r',
    scatter_kws=marker_style,
)
#Note: if you are working with ordinal data, you might find it useful to do a jitterplot instead

In [None]:
#Regression plot with line, Lower first molar BL and MD measurements
#Marker styling for the scatter points: size 60 with a thin black outline
marker_style = {'s': 60, 'linewidths': .7, 'edgecolors': 'black'}
regplot = sns.lmplot(
    data=df, x="Bi_LM1BL_mm", y="Bi_LM1MD_mm", hue="Sex",
    height=7, aspect=1.6, robust=True, palette='cubehelix_r',
    scatter_kws=marker_style,
)

In [None]:
#Size is always an issue with biological data. We don't want to be biased by size, so lets make some indices and plot those instead

#Each index is the BL dimension divided by the MD dimension (upper first molar, then lower first molar)
df['M1U_index'] = df['Bi_UM1BL_mm'].div(df['Bi_UM1MD_mm'])
df['M1L_index'] = df['Bi_LM1BL_mm'].div(df['Bi_LM1MD_mm'])

#Plot the upper index against the lower index, coloured by sex
marker_style = {'s': 60, 'linewidths': .7, 'edgecolors': 'black'}
regplot = sns.lmplot(
    data=df, x="M1U_index", y="M1L_index", hue="Sex",
    height=7, aspect=1.6, robust=True, palette='cubehelix_r',
    scatter_kws=marker_style,
)

In [None]:
#Regression plot with partitioned data
#1 subset data
df_fem=df[df.Sex=="F"]
df_m=df[df.Sex=="M"]

#2 Make the same regression plot on each subset -- females first, then males.
#One loop replaces two copy-pasted calls, so the plots cannot drift apart and
#each title always matches the subset that was actually plotted.
for sex_title, sex_subset in (('Females', df_fem), ('Males', df_m)):
    regplot = sns.lmplot(x="Bi_UM2MD_mm", y="Bi_UM1MD_mm", hue="Ancestry", data=sex_subset, height=7, aspect=1.6, robust=True, palette='tab10', scatter_kws=dict(s=60, linewidths=.7, edgecolors='black'))
    plt.title(sex_title)

In [None]:
#Regression of Completion_% on Age_Clean_yr, one fitted line per sex
marker_style = {'s': 60, 'linewidths': .7, 'edgecolors': 'black'}
regplot = sns.lmplot(
    data=df, x="Age_Clean_yr", y="Completion_%", hue="Sex",
    height=7, aspect=1.6, robust=True, palette='cool',
    scatter_kws=marker_style,
)

In [None]:
#Regression of Completion_% on Age_Clean_yr, one fitted line per ancestry group
marker_style = {'s': 60, 'linewidths': .7, 'edgecolors': 'black'}
regplot = sns.lmplot(
    data=df, x="Age_Clean_yr", y="Completion_%", hue="Ancestry",
    height=7, aspect=1.6, robust=True, palette='tab10',
    scatter_kws=marker_style,
)

Explore some of the relationships between variables that you're interested in