# University of Texas, Dallas - Hackathon Oct 2017


## Problem 3: Somerville Happiness Survey
                


In [None]:
## Importing Libraries

In [None]:
import csv

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20, so it is only
# kept as a fallback for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
%matplotlib inline

# Load the Somerville happiness survey straight from the city's open-data portal.
city = pd.read_csv('https://data.somervillema.gov/api/views/yevj-2b33/rows.csv')

# Summary statistics of the numeric columns (shown as notebook output).
city.describe()

# Non-null answer counts per column for each survey year. Grouping rows is the
# default, so the deprecated `axis=0` keyword is not passed.
city.groupby(by='Year').count()

def _all_null_columns(frame):
    """Return the labels of the columns of ``frame`` in which every value is null."""
    is_empty = frame.isnull().all(axis=0)
    return is_empty[is_empty].index.tolist()


# One frame per survey year
city_2011 = city.loc[city['Year'] == 2011]
city_2013 = city.loc[city['Year'] == 2013]
city_2015 = city.loc[city['Year'] == 2015]

# 2011: list the columns that hold no data at all for that year, then drop them
na_col_list = _all_null_columns(city_2011)
city_2011_nonull = city_2011.drop(na_col_list, axis='columns')

city_2011_nonull.shape

city_2011_nonull.head(5)
contents2011 = city_2011_nonull.columns
contents2011

### Same clean-up for the 2013 and 2015 frames
### 2013
na_col_list = _all_null_columns(city_2013)
city_2013_nonull = city_2013.drop(na_col_list, axis='columns')

### 2015
na_col_list = _all_null_columns(city_2015)
city_2015_nonull = city_2015.drop(na_col_list, axis='columns')

contents2015 = city_2015_nonull.columns

### Keep only the columns that are common to all survey years

# 2011 columns to discard so that only the shared columns remain
rem2011 = [
    'The.availability.of.affordable.housing_2011',
    'How.would.you.rate.the.following..The.beauty.or.physical.setting_2011',
    'When.making.decisions..are.you.more.likely.to.seek.advice.or.decide.for.yourself._2011.',
    'In.general..how.similar.are.you.to.other.people.you.know._2011',
    'Marital.status._2011',
]

city_2011_raw = city_2011_nonull.drop(rem2011, axis='columns')

city_2011_raw.shape

# 2013 columns to discard so that only the columns shared by all years remain
rem2013 = [
    'How.would.you.rate.the.following..The.cost.of.housing.',
    'How.would.you.rate.the.following..The.beauty.or.physical.setting.of.Somerville_2013',
    'How.satisfied.are.you.with.your.neighborhood.',
    'How.would.you.rate.the.following..The.availability.of.social.community.events',
    'How.safe.do.you.feel.walking.in.your.neighborhood.at.night_2013',
    'How.would.you.rate.the.following..The.maintenance.of.streets..sidewalks..and..squares_2013',
    'How.satisfied.are.you.with.the.beauty.or.physical.setting.of.your.neighborhood.',
    'How.satisfied.are.you.with.the.appearance.of.parks.in.your.neighborhood._2013',
    'Are.you.of.Hispanic..Latino..or.Spanish.origin._2013',
    'Do.you.have.children.age.18.or.younger.who.live.with.you.',
    'Do.you.plan.to.move.away.from.Somerville.in.the.next.two.years.',
    'What.neighborhood.do.you.live.in.',
    'Are.you.a.student.',
    'Ward',
    'Precinct',
]

city_2013_raw = city_2013_nonull.drop(rem2013, axis='columns')

### Deleting the 2015 columns that are not present in 2011

rem2015 = [
    'How.proud.are.you.to.be.a.Somerville.resident._2015',
    'How.would.you.rate.the.following..The.availability.of.information.about.city.services._2015',
    'How.would.you.rate.the.following..The.cost.of.housing.',
    'How.would.you.rate.the.following..The.maintenance.of.streets.and.sidewalks_2015',
    'How.would.you.rate.the.following..The.availability.of.social.community.events',
    'How.safe.do.you.feel.walking.in.your.community.at.night._2015',
    'How.satisfied.are.you.with.the.beauty.or.physical.setting.of.your.neighborhood.',
    'How.satisfied.are.you.with.the.appearance.of.parks.and.squares.in.your.neighborhood.',
    'What.language..other.than.English..do.you.speak.at.home._2015',
    'Do.you.have.children.age.18.or.younger.who.live.with.you.',
    'Describe.your.housing.status.in.Somerville.',
    'Do.you.plan.to.move.away.from.Somerville.in.the.next.two.years.',
    'What.neighborhood.do.you.live.in.',
    'Are.you.a.student.',
    'Ward',
    'How.satisfied.are.you.with.your.neighborhood.',
    'Precinct',
]

city_2015_raw = city_2015_nonull.drop(rem2015, axis='columns')

city_2015_raw.shape

### Give the three yearly frames one consistent set of column names

renamed_cols = [
    'Combined_ID',
    'year',
    'happiness_index',
    'satisfaction_life_index',
    'satisfaction_place_index',
    'quality_public_schools',
    'local_police_effectiveness',
    'sex',
    'age',
    'race',
    'time_lived',
    'annual_household_income',
]

# The labels are assigned by position, so every frame is expected to carry
# its remaining columns in this same order.
for yearly_frame in (city_2011_raw, city_2015_raw, city_2013_raw):
    yearly_frame.columns = renamed_cols

### Stack the three yearly frames into a single dataset

data = [city_2011_raw, city_2013_raw, city_2015_raw]

city_final = pd.concat(data)

city_final.shape

#### Drop the Combined_ID column: a row identifier is not significant for the model

city_final1 = city_final.drop('Combined_ID', axis=1)


### Drop the rows that have a null target value

city_final1 = city_final1[city_final1.happiness_index.notnull()]

### Remove non-numeric values from the entire dataset

# Malformed entries present in the raw answers (stray letters, dates, typos).
invalid_tokens = ['9*', 'R', 'i', 'X', 'r', 'F', 'n', 'e', '3f',
                  '5/27/2011', '5/19/2011', '6/6/2011', '6/1/2011']

# Replace every malformed entry with a real missing value (NaN). The previous
# placeholder was the *string* 'NA', which pandas treats as ordinary data, so
# the dropna() below never removed the affected rows.
df = city_final1.mask(city_final1.isin(invalid_tokens))

len(df)

# Keep only the rows in which every remaining column holds a value.
df1 = df.dropna(axis=0, how='any')

len(df1)

### Separate the target variable from the predictors

city_final_target = df1.loc[:, 'happiness_index']

city_final_var = df1.drop('happiness_index', axis='columns')

### One-hot encode the categorical predictors

city_final_var = pd.get_dummies(city_final_var)
city_final_var.shape

### Hold out 30% of the rows for testing, train on the remaining 70%

X_train, X_test, y_train, y_test = train_test_split(
    city_final_var,
    city_final_target,
    test_size=0.3,
    random_state=0,
)

### Fit a logistic regression classifier on the training split

happiness_model1 = LogisticRegression(random_state=0)

happiness_model = happiness_model1.fit(X_train, y_train)

### Confusion matrix of the predictions on the test split

Y_HAT = happiness_model.predict(X_test)

conf_matrix = confusion_matrix(y_test, Y_HAT)

print(conf_matrix)

### Accuracy of the model on the test split

acc = happiness_model.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set {:.2f} '.format(acc))

### Precision and recall per class
print(metrics.classification_report(y_test, Y_HAT))

In [None]:
# Assertion block: verifies that the solution's accuracy clears the pass mark.

def verify_answer(x):
    """Return True when ``x`` is strictly greater than 0.50, otherwise False."""
    return bool(x > .50)


try:
    ref_assert_var = verify_answer(acc)
except Exception as e:
    # Any failure while checking is reported instead of aborting the notebook.
    print('Your assertion block throws error: ' + str(e))
else:
    print('Well done team' if ref_assert_var else 'The answer did not pass the test.')