# Data Cleansing for Angela Duckworth's "Grit" Dataset

In [51]:
import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

## Load the data and remove missing values

In [52]:
df = pd.read_csv("../duckworth-grit-scale-data/csvData.csv")

In [53]:
df.head()

Unnamed: 0,country,surveyelapse,GS1,GS2,GS3,GS4,GS5,GS6,GS7,GS8,...,O7,O8,O9,O10,operatingsystem,browser,screenw,screenh,introelapse,testelapse
0,RO,174.0,1.0,1.0,3.0,3.0,3.0,2.0,3.0,1.0,...,5.0,4.0,5.0,4.0,Windows,Chrome,1366.0,768.0,69590.0,307.0
1,US,120.0,2.0,2.0,3.0,3.0,2.0,1.0,3.0,3.0,...,4.0,3.0,4.0,5.0,Macintosh,Chrome,1280.0,800.0,33657.0,134.0
2,US,99.0,3.0,3.0,3.0,3.0,4.0,3.0,4.0,4.0,...,5.0,5.0,4.0,4.0,Windows,Firefox,1920.0,1080.0,95550.0,138.0
3,KE,5098.0,1.0,3.0,4.0,2.0,4.0,1.0,5.0,4.0,...,4.0,2.0,5.0,4.0,Windows,Chrome,1600.0,900.0,4.0,4440.0
4,JP,340.0,1.0,2.0,3.0,3.0,2.0,2.0,2.0,4.0,...,4.0,1.0,3.0,2.0,Windows,Firefox,1920.0,1080.0,3.0,337.0


In [54]:
from random import shuffle
#n = len(full) #number of rows in your dataset
#indices = range(n)
#shuffle(indices)
#print "train indices:", indices[:3300]
#print "test indices:", indices[3300:]

#df = full.loc[indices[:3300]]
#df = full

In [55]:
df = df.dropna()

In [56]:
len(df)

4226

## Translate Grit and Big Five personality triats 

In [57]:
df['grit'] = (df['GS2'] + df['GS3'] + df['GS5'] + df['GS7'] + df['GS8'] + df['GS11'] +
             (6-df['GS1']) + (6-df['GS4']) + (6-df['GS6']) + (6-df['GS9']) + (6-df['GS10']) + (6-df['GS12']))/12

In [58]:
df['grit'].head()

0    3.000000
1    3.416667
2    3.250000
3    4.333333
4    3.083333
Name: grit, dtype: float64

In [59]:
df['extroversion'] = df['E1'] + (6-df['E2']) + df['E3'] + (6-df['E4']) + df['E5'] + (6-df['E6']) + \
                    df['E7'] + (6-df['E8']) + df['E9'] + (6-df['E10'])

In [60]:
df['extroversion'].head()

0    31.0
1    40.0
2    18.0
3    19.0
4    12.0
Name: extroversion, dtype: float64

In [61]:
df['neuroticism'] = df['N1'] + (6-df['N2']) + df['N3'] + (6-df['N4']) + df['N5'] + df['N6'] + \
                    df['N7'] + df['N8'] + df['N9'] + df['N10']

In [62]:
df['agreeableness'] = (6-df['A1']) + df['A2'] + (6-df['A3']) + df['A4'] + (6-df['A5']) + df['A6'] + \
                      (6-df['A7']) + df['A8'] + df['A9'] + df['A10']

In [63]:
df['conscientiousness'] = df['C1'] + (6-df['C2']) + df['C3'] + (6-df['C4']) + df['C5'] + (6-df['C6']) + \
                          df['C7'] + (6-df['C8']) + df['C9'] + df['C10']

In [64]:
df['openness'] = df['O1'] + (6-df['O2']) + df['O3'] + (6-df['O4']) + df['O5'] + (6-df['O6']) + \
                 df['O7'] + (6-df['O8']) + df['O9'] + df['O10']#5?10?

## Create dummy variables and update their names

In [65]:
df = df.join(pd.get_dummies(df['operatingsystem'],prefix='os'))

In [66]:
df = df.join(pd.get_dummies(df['browser'],prefix='browser'))

In [67]:
df = df.join(pd.get_dummies(df['country'],prefix='country'))

In [68]:
df = df[df['urban'] > .5]
df = df.join(pd.get_dummies(df['urban'],prefix='area'))
df = df.rename(index=str, columns={"area_1.0": "area_rural", 
                              "area_2.0": "area_suburban",
                              "area_3.0": "area_urban"})

In [69]:
df = df.join(pd.get_dummies(df['hand'],prefix='hand'))
df = df.rename(index=str, columns={"hand_1.0": "hand_right", 
                              "hand_2.0": "hand_left",
                              "hand_3.0": "hand_both"})

In [70]:
df = df[df['religion'] > .5]
df = df.join(pd.get_dummies(df['religion'],prefix='religion'))
df = df.rename(index=str, columns={"religion_1.0": "religion_agnostic", 
                              "religion_2.0": "religion_atheist",
                              "religion_3.0": "religion_buddhist",
                              "religion_4.0": "religion_catholic",
                              "religion_5.0": "religion_mormon",
                              "religion_6.0": "religion_protestant",
                              "religion_7.0": "religion_christianother",
                              "religion_8.0": "religion_hindu",
                              "religion_9.0": "religion_jewis",
                              "religion_10.0": "religion_muslim",
                              "religion_11.0": "religion_sikh",
                              "religion_12.0": "religion_other"})

In [71]:
df = df[df['orientation'] > .5]
df = df.join(pd.get_dummies(df['orientation'],prefix='orientation'))
df = df.rename(index=str, columns={"orientation_1.0": "orientation_heterosexual", 
                              "orientation_2.0": "orientation_bisexual",
                              "orientation_3.0": "orientation_homosexual",
                              "orientation_4.0": "orientation_asexual",
                              "orientation_5.0": "orientation_other"})

In [72]:
df = df[df['race'] > .5]
df = df.join(pd.get_dummies(df['race'],prefix='race'))
df = df.rename(index=str, columns={"race_1.0": "race_asian", 
                              "race_2.0": "race_arab",
                              "race_3.0": "race_black",
                              "race_4.0": "race_white",
                              "race_5.0": "race_other"})

In [73]:
df = df[df['married'] > .5]
df = df.join(pd.get_dummies(df['married'],prefix='married'))
df = df.rename(index=str, columns={"married_1.0": "married_never", 
                              "married_2.0": "married_currently",
                              "married_3.0": "married_previously"})

## Define a "liar" / "false confidence" / "validity" column

In [74]:
df['liar'] = (df['VCL6'] == 1.0) | (df['VCL9'] == 1.0) | (df['VCL12'] == 1.0)

## Use 0 and 1 instead of 1 and 2 for Y/N variables

In [38]:
df = df[df['gender'] < 3]
df.loc[df['gender'] == 2,'gender'] = 0.0
df.loc[df['engnat'] == 2,'engnat'] = 0.0
df.loc[df['voted'] == 2,'voted'] = 0.0

In [39]:
len(df)

3959

## Remove the old pre-dummy columns

In [40]:
df = df.drop(['operatingsystem','browser','E1','E2','E3','E4','E5','E6','E7','E8','E9','E10',\
              'N1','N2','N3','N4','N5','N6','N7','N8','N9','N10','A1','A2','A3','A4','A5',\
              'A6','A7','A8','A9','A10','C1','C2','C3','C4','C5','C6','C7','C8','C9','C10',\
              'O1','O2','O3','O4','O5','O6','O7','O8','O9','O10','GS1','GS2','GS3','GS4','GS5',\
              'GS6','GS7','GS8','GS9','GS10','GS11','GS12','country','VCL1','VCL2','VCL3','VCL4',\
              'VCL5','VCL6','VCL7','VCL8','VCL9','VCL10','VCL11','VCL12','VCL13','VCL14',\
              'VCL15','VCL16','race','orientation','religion','hand','urban','married'],axis=1)

In [41]:
df.head()

Unnamed: 0,surveyelapse,education,gender,engnat,age,voted,familysize,screenw,screenh,introelapse,...,orientation_other,race_asian,race_arab,race_black,race_white,race_other,married_never,married_currently,married_previously,liar
0,174.0,4.0,0.0,0.0,28.0,1.0,2.0,1366.0,768.0,69590.0,...,0,0,0,0,1,0,1,0,0,False
1,120.0,2.0,0.0,1.0,19.0,0.0,3.0,1280.0,800.0,33657.0,...,0,0,0,0,1,0,1,0,0,True
3,5098.0,3.0,0.0,1.0,30.0,1.0,6.0,1600.0,900.0,4.0,...,0,0,0,1,0,0,1,0,0,False
4,340.0,4.0,1.0,0.0,38.0,0.0,3.0,1920.0,1080.0,3.0,...,0,1,0,0,0,0,0,1,0,False
6,126.0,3.0,1.0,1.0,35.0,0.0,1.0,1366.0,768.0,36.0,...,0,0,0,0,1,0,0,0,1,False


## Save the cleaned data

In [42]:
df.to_csv("../duckworth-grit-scale-data/cleanData.csv")