In [1]:
# With the goal above, I will import just what I need. 
# The model to use (I already imported it above, but will do it again here so each example is self-contained)
from sklearn.ensemble import RandomForestRegressor

# The error metric. In this case, we will use c-stat (aka ROC/AUC)
from sklearn.metrics import roc_auc_score

# An efficient data structure. 
import pandas as pd
import numpy as np

In [2]:
get_ipython().magic(u'pylab inline')
get_ipython().magic(u'matplotlib inline')
# Import the data
X = pd.read_csv("RedditShortDemoSurvey-1-Cleaned.csv")
X.columns  # output the column names

# Standardize Column Names
X.columns = ['Entry_Id', 'Gender', 'Categorical_Age', 'Martial_Status', 'Employment_Status', 'Military_Service', 'Household_Dependents', 'Education', 'Country', 'US_State', 'Annual_Gross_Income', 'Subreddit', 'Dog_OR_Cat', 'Cheese', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19']
y = X.pop("Annual_Gross_Income")

Populating the interactive namespace from numpy and matplotlib


In [3]:
X.drop(["US_State"], axis=1, inplace=True)  # drop us states

In [4]:
X.describe(include='all')

Unnamed: 0,Entry_Id,Gender,Categorical_Age,Martial_Status,Employment_Status,Military_Service,Household_Dependents,Education,Country,Subreddit,Dog_OR_Cat,Cheese,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19
count,32754.0,32754,32754,32754,32754,32754,32754,32754,32754,28419,32752,32754,3,3,1,1,1,1
unique,,3,8,7,7,3,3,8,440,1833,7,17,3,2,1,1,1,1
top,,Male,18-24,Single,Employed full time,No,No,Bachelor's degree,United States,askreddit,I like dogs.,Other,7/13/11 15:47,public,I like cats.,Other,7/14/11 13:22,public
freq,,26418,15802,10428,14814,30526,27488,11046,20967,2123,17149,6562,1,2,1,1,1,1
mean,16378.561916,,,,,,,,,,,,,,,,,
std,9456.272427,,,,,,,,,,,,,,,,,
min,1.0,,,,,,,,,,,,,,,,,
25%,8189.25,,,,,,,,,,,,,,,,,
50%,16379.5,,,,,,,,,,,,,,,,,
75%,24567.75,,,,,,,,,,,,,,,,,


In [5]:
import csv

# read csv file into a nested list

def get_csv(file1):
    returnlist = []
    with open(file1, 'U') as csvfile:
        freader = csv.reader(csvfile, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for row in freader:
            returnlist.append(row)
            
    return returnlist

countries_data = get_csv("countries.csv")  # data from https://www.worlddata.info/download/countries.csv


In [6]:
countries_data # verify populated

[['Country (en)',
  'Country (de)',
  'Country (local)',
  'Country code',
  'Continent',
  'Capital',
  'Population',
  'Area',
  'Coastline',
  'Government form',
  'Currency',
  'Currency code',
  'Dialing prefix',
  'Birthrate',
  'Deathrate',
  'Life expectancy',
  'Url'],
 ['Afghanistan',
  'Afghanistan',
  'Afganistan/Afqanestan',
  'AF',
  'Asia',
  '',
  '32564342',
  '652230',
  '0',
  'islamic republic',
  'Afghani',
  'AFN',
  '93',
  '38.6',
  '13.9',
  '50.9',
  'https://www.laenderdaten.info/Asien/Afghanistan/index.php'],
 ['Egypt',
  '\xc3\x84gypten',
  'Misr',
  'EG',
  'Africa',
  '',
  '88487396',
  '1001450',
  '2450',
  'republic',
  'Pfund',
  'EGP',
  '20',
  '22.9',
  '4.8',
  '73.7',
  'https://www.laenderdaten.info/Afrika/Aegypten/index.php'],
 ['Albania',
  'Albanien',
  'Shqip\xc3\xabria',
  'AL',
  'Europe',
  '',
  '3029278',
  '28748',
  '362',
  'republic',
  'Lek',
  'ALL',
  '355',
  '12.9',
  '6.6',
  '78.1',
  'https://www.laenderdaten.info/Europa/Al

In [7]:
#handle data for continent population

countries = []
continents = []

for country in countries_data:
    countries.append(country[0].lower())
    
    
for continent in countries_data:
    continents.append(continent[4].lower())
    
countries = countries[1:]  #remove header label
continents = continents[1:]  #remove header label

country_dict = {}  #dictionary that will hold the country as key and continent as value 

for entry in range(len(countries)):
    country_dict[countries[entry]] = continents[entry]

In [8]:
len(country_dict)

247

In [9]:
X.drop(["Unnamed: 14", "Unnamed: 15", "Unnamed: 16", "Unnamed: 17", "Unnamed: 18", "Unnamed: 19", "Entry_Id"], axis=1, inplace=True)



In [10]:
X

Unnamed: 0,Gender,Categorical_Age,Martial_Status,Employment_Status,Military_Service,Household_Dependents,Education,Country,Subreddit,Dog_OR_Cat,Cheese
0,Male,25-34,Blank,Employed full time,none,No,Bachelor's degree,United States,getmotivated,none,none
1,Male,25-34,Blank,Employed full time,none,No,Bachelor's degree,United States,gaming,none,none
2,Female,18-24,Blank,Freelance,none,No,Some college,United States,snackexchange,none,none
3,Male,25-34,Blank,Freelance,none,No,Bachelor's degree,United States,spacedicks,none,none
4,Female,25-34,Blank,Employed full time,none,No,Bachelor's degree,United States,aww,none,none
5,Male,25-34,Married/civil union/domestic partnership,Employed full time,No,No,Bachelor's degree,United States,gaming,I like dogs.,Cheddar
6,Male,25-34,In a relationship,Employed full time,No,No,Some college,Germany,python,I like dogs.,Cheddar
7,Male,18-24,Married/civil union/domestic partnership,"Not employed, but looking for work",No,No,Bachelor's degree,United States,fffffffuuuuuuuuuuuu,I like dogs.,American
8,Male,35-44,Married/civil union/domestic partnership,Employed full time,No,No,Bachelor's degree,United States,todayilearned,I like cats.,Provolone
9,Male,25-34,In a relationship,Freelance,No,No,Some college,Germany,,I like cats.,Munster


In [11]:
X['hasDependents'] = np.where((X['Household_Dependents'] == 'Yes') | (X['Household_Dependents'] ==  'None'), 1, 0)

In [12]:
X.Household_Dependents.value_counts()
X.hasDependents.value_counts()

0    27488
1     5266
Name: hasDependents, dtype: int64

In [13]:
continent_list = []

for entry in X['Country']:
    cleaned = entry.lower()
    cleaned = cleaned.strip()
    if ('ireland' in cleaned):
        continent_list.append(country_dict['ireland'])
    elif ((cleaned == 'the netherlands') | (cleaned == 'holland')):
        continent_list.append(country_dict['netherlands'])
    elif (('united states' in cleaned) | ('america' in cleaned) | (cleaned == 'united strate') | (cleaned == 'u.s.a.') |(cleaned == 'california') | (cleaned == 'usa') | (cleaned == 'us') | (cleaned == 'united stateds')):
        continent_list.append(country_dict['united states'])
    elif ((cleaned == 'united states of america.') | (cleaned == 'us of a') | (cleaned == 'idaho') | (cleaned == 'amerikka') | (cleaned == 'united states of america') | (cleaned == 'united states of america (washington, dc)')):
        continent_list.append(country_dict['united states'] )
    elif ((cleaned == 'united sates') | (cleaned == "united sr") | (cleaned == 'u.s') | (cleaned == 'unites states') | (cleaned == 'united states of') | (cleaned == 'united states :f') | (cleaned == 'united states of american') | (cleaned == 'united states of america (washington dc, y u no listed?)')):
        continent_list.append(country_dict['united states'])
    elif ((cleaned == 'england') | (cleaned == 'peoples republic of south yorkshire') | (cleaned == 'united kingdon') | (cleaned == 'wales') | (cleaned == 'northern ireland')):
        continent_list.append(country_dict['united kingdom'])
    elif (('england' in cleaned) | ('kingdom' in cleaned) | ('britain' in cleaned) | ('britian' in cleaned) | ('u.k' in cleaned) | (cleaned == 'united kingdown') | (('united ki' in cleaned))):
        continent_list.append(country_dict['united kingdom'])
    elif (('scotland' in cleaned) | (cleaned == "united jingdom") | (cleaned == 'u.k') | (cleaned == 'u.k.') | (cleaned == 'uk')):
        continent_list.append(country_dict['united kingdom'])
    elif (cleaned == 'm\xc3\xa9xico'):
        continent_list.append(country_dict['mexico'])
    elif (cleaned == 'korea, south'):
        continent_list.append(country_dict['south korea'])
    elif ((cleaned == 'cananda') | (cleaned == 'canada, eh?') | (cleaned == 'canadia') | (cleaned == 'toronto') | (cleaned == 'canda') | (cleaned == 'camada')):
        continent_list.append(country_dict['canada'])
    elif ('canada' in cleaned):
        continent_list.append(country_dict['canada'])
    elif (('germany' in cleaned) | (cleaned == 'ger')):
        continent_list.append(country_dict['germany'])
    elif ('russia' in cleaned):
        continent_list.append(country_dict['russia'])
    elif ('poland' in cleaned):
        continent_list.append(country_dict['poland'])
    elif ('united arab' in cleaned):
        continent_list.append(country_dict['united arab emirates'])
    elif ('moldova' in cleaned):
        continent_list.append(country_dict['moldova'])
    elif (('swe' in cleaned) | (cleaned == 'swden')):
        continent_list.append(country_dict['sweden'])
    elif ('singapore' in cleaned):
        continent_list.append(country_dict['singapore'])
    elif (('china' in cleaned) | (cleaned == 'hong')):
        continent_list.append(country_dict['china'])
    elif (cleaned == 'swiss'):
        continent_list.append(country_dict['switzerland'])
    elif ('fra' in cleaned):
        continent_list.append(country_dict['france'])
    elif (('trinidad' in cleaned) | ('tobago' in cleaned)):
        continent_list.append(country_dict['trinidad and tobago'])
    elif (('holy see' in cleaned)):
        continent_list.append(country_dict['holy see (vatican city)'])
    elif (('austral' in cleaned) | ('stralia' in cleaned)):
        continent_list.append(country_dict['australia'])
    elif (('macedonia' in cleaned)):
        continent_list.append(country_dict['macedonia'])
    elif (('czech' in cleaned)):
        continent_list.append(country_dict['czech republic'])
    elif (('norway' in cleaned)):
        continent_list.append(country_dict['norway'])
    elif (('iraq' in cleaned)):
        continent_list.append(country_dict['iraq'])
    elif ((cleaned == 'brasil')):
        continent_list.append(country_dict['brazil'])
    elif ((cleaned == 'none') | (cleaned == 'n') | ('/' in cleaned) | (cleaned == 'equestria') | (cleaned == 'peter poppins') | (cleaned == 'united') | (cleaned == 'eire') | (cleaned == 'slovak republic')):
        continent_list.append(None)
    elif ((cleaned == 'basque country') | (cleaned == 'catalonia') | (cleaned == 'brunei darussalem') | (cleaned == 'sv') | (cleaned == 'random') | (cleaned == 'people') | (len(cleaned) > 50) | (cleaned == 'd.k:') | (cleaned == 'grand duchy of baden') | (cleaned == 'vou0302833') | (cleaned == 'kosova')):
        continent_list.append(None)
    else:
        try:
            continent_list.append(country_dict[cleaned])
        except KeyError:
            continent_list.append(None)
    
X['Continent'] = continent_list

In [14]:
X.drop(["Household_Dependents", "Country"], axis=1, inplace=True)

X = X.dropna(axis=0)  # drop missing values 

In [15]:
X.describe(include='all')

Unnamed: 0,Gender,Categorical_Age,Martial_Status,Employment_Status,Military_Service,Education,Subreddit,Dog_OR_Cat,Cheese,hasDependents,Continent
count,28321,28321,28321,28321,28321,28321,28321,28321,28321,28321.0,28321
unique,3,8,7,7,3,8,1824,7,15,,9
top,Male,18-24,Single,Employed full time,No,Bachelor's degree,askreddit,I like dogs.,Other,,north america
freq,22931,14038,9132,12587,26463,9444,2118,14711,5628,,21246
mean,,,,,,,,,,0.158822,
std,,,,,,,,,,0.365517,
min,,,,,,,,,,0.0,
25%,,,,,,,,,,0.0,
50%,,,,,,,,,,0.0,
75%,,,,,,,,,,0.0,


In [None]:
categorical_variables = ['Gender', 'Martial_Status', 'Employment_Status', 'Military_Service', 'Education', 'Subreddit', 'Dog_OR_Cat', ]

for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)