# Somerville Happiness Survey

#### Importing the libraries

In [57]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

#### Set all the column names for training_data set

In [58]:
df = pd.read_csv('happiness.csv', names=['Combined_ID','Year','How.happy.do.you.feel.right.now.','How.satisfied.are.you.with.your.life.in.general.','How.satisfied.are.you.with.Somerville.as.a.place.to.live.','In.general..how.similar.are.you.to.other.people.you.know._2011','When.making.decisions..are.you.more.likely.to.seek.advice.or.decide.for.yourself._2011.','How.satisfied.are.you.with.your.neighborhood.','How.proud.are.you.to.be.a.Somerville.resident._2015','How.would.you.rate.the.following..The.availability.of.information.about.city.services._2015','The.availability.of.affordable.housing_2011','How.would.you.rate.the.following..The.cost.of.housing.','How.would.you.rate.the.following..The.overall.quality.of.public.schools.in.your.community._2011','How.would.you.rate.the.following..The.overall.quality.of.public.schools.','How.would.you.rate.the.following..The.beauty.or.physical.setting_2011','How.would.you.rate.the.following..The.beauty.or.physical.setting.of.Somerville_2013','How.would.you.rate.the.following..The.effectiveness.of.the.local.police_2011_2013','How.would.you.rate.the.following..Your.trust.in.the.local.police_2015','How.would.you.rate.the.following..The.maintenance.of.streets..sidewalks..and..squares_2013','How.would.you.rate.the.following..The.maintenance.of.streets.and.sidewalks_2015','How.would.you.rate.the.following..The.availability.of.social.community.events','How.safe.do.you.feel.walking.in.your.neighborhood.at.night_2013','How.safe.do.you.feel.walking.in.your.community.at.night._2015','How.satisfied.are.you.with.the.beauty.or.physical.setting.of.your.neighborhood.','How.satisfied.are.you.with.the.appearance.of.parks.in.your.neighborhood._2013','How.satisfied.are.you.with.the.appearance.of.parks.and.squares.in.your.neighborhood.','What.is.your.sex.','What.is.your.gender._2011','Age.','Marital.status._2011','What.language..other.than.English..do.you.speak.at.home._2015','What.is.your.race_2011_2013','Are.you.of.Hispanic..Latino..or.Spanish.origin._2013','What.is.your.race.or.ethnicity._2015','Do.you.have.children.age.18.or.younger.who.live.with.you.','Describe.your.housing.status.in.Somerville.','Do.you.plan.to.move.away.from.Somerville.in.the.next.two.years.','How.long.have.you.lived.here.','What.is.your.annual.household.income.','What.neighborhood.do.you.live.in.','Are.you.a.student.','Ward','Precinct'])


#### Data cleansing

In [59]:
df.drop(['How.satisfied.are.you.with.your.neighborhood.','How.would.you.rate.the.following..The.cost.of.housing.','How.satisfied.are.you.with.the.beauty.or.physical.setting.of.your.neighborhood.','What.is.your.sex.','Do.you.have.children.age.18.or.younger.who.live.with.you.','Precinct','Ward','Are.you.a.student.','How.would.you.rate.the.following..The.availability.of.social.community.events','What.neighborhood.do.you.live.in.','Do.you.plan.to.move.away.from.Somerville.in.the.next.two.years.','How.would.you.rate.the.following..The.overall.quality.of.public.schools.','How.would.you.rate.the.following..The.beauty.or.physical.setting.of.Somerville_2013','How.safe.do.you.feel.walking.in.your.neighborhood.at.night_2013','How.would.you.rate.the.following..The.maintenance.of.streets..sidewalks..and..squares_2013','How.satisfied.are.you.with.the.appearance.of.parks.in.your.neighborhood._2013','Are.you.of.Hispanic..Latino..or.Spanish.origin._2013','How.would.you.rate.the.following..The.maintenance.of.streets.and.sidewalks_2015','How.safe.do.you.feel.walking.in.your.community.at.night._2015','What.is.your.race.or.ethnicity._2015','How.would.you.rate.the.following..The.availability.of.information.about.city.services._2015','How.proud.are.you.to.be.a.Somerville.resident._2015','How.satisfied.are.you.with.the.appearance.of.parks.and.squares.in.your.neighborhood.','Describe.your.housing.status.in.Somerville.','How.would.you.rate.the.following..Your.trust.in.the.local.police_2015','What.language..other.than.English..do.you.speak.at.home._2015','Year'], axis=1, inplace=True)
df.dropna(inplace=True)

#### Data cleansing

In [60]:
cleanup_nums = {"Age.":{"31-40": 35, "26-30": 28, "61+": 61, "41-50": 45, "51-60":55, "22-25":23, "R":18, "18-21":19},                "How.long.have.you.lived.here.": {"0-5 Years": 2.5, "18+": 20, "6-11 Years": 8, "12-17 Years": 15,"R": 0},"What.is.your.annual.household.income.": {"150,000 or more": 151000, "100,000 and up":100000,"50,000 - $59,999":55000,"R":0,"40,000 - $49,999":45000,"60,000 - $69,999":65000,"30,000 - $39,999":35000,"20,000 - $29,999":25000,"70,000 - $79,999":75000,"80,000 - $89,999":85000,"10,000 - $19,999":15000,"90,000 - $99,999":95000,"Less than $10,000":5000}}
df.replace(cleanup_nums, inplace=True)

#### Clean main label

In [61]:
df['How.happy.do.you.feel.right.now.'] = pd.to_numeric(df['How.happy.do.you.feel.right.now.'].astype(str), errors='coerce').fillna(0).astype(int)
df['How.happy.do.you.feel.right.now.'] = df['How.happy.do.you.feel.right.now.'].apply(lambda x: np.where(x>=7,1,0))

#### Clean other satisfaction label

In [62]:
df['How.satisfied.are.you.with.your.life.in.general.'] = pd.to_numeric(df['How.satisfied.are.you.with.your.life.in.general.'].astype(str), errors='coerce').fillna(0).astype(int)
df['How.satisfied.are.you.with.your.life.in.general.'] = df['How.satisfied.are.you.with.your.life.in.general.'].apply(lambda x: np.where(x>=7,1,0))

#### Clean other satisfaction label

In [63]:
df['How.satisfied.are.you.with.Somerville.as.a.place.to.live.'] = pd.to_numeric(df['How.satisfied.are.you.with.Somerville.as.a.place.to.live.'].astype(str), errors='coerce').fillna(0).astype(int)
df['How.satisfied.are.you.with.Somerville.as.a.place.to.live.'] = df['How.satisfied.are.you.with.Somerville.as.a.place.to.live.'].apply(lambda x: np.where(x>=7,1,0))

#### Clean other columns

In [64]:
cols_to_transform = [ 'How.would.you.rate.the.following..The.effectiveness.of.the.local.police_2011_2013',
                      'What.is.your.race_2011_2013',
                      'What.is.your.gender._2011',
                      'Marital.status._2011',
                      'How.would.you.rate.the.following..The.beauty.or.physical.setting_2011',
                      'The.availability.of.affordable.housing_2011',
                      'How.would.you.rate.the.following..The.overall.quality.of.public.schools.in.your.community._2011',
                      'When.making.decisions..are.you.more.likely.to.seek.advice.or.decide.for.yourself._2011.',
                      'In.general..how.similar.are.you.to.other.people.you.know._2011']


In [65]:
for column in cols_to_transform:
    df[column] = pd.Categorical(df[column])
    df[column] = df[column].cat.codes

train_cols = df.columns[-14:]

#### Run Logistic Regression algorithm

In [67]:
df = df.apply(pd.to_numeric, errors='coerce', axis=1)
seed = 7
k=10
kFold = model_selection.KFold(n_splits=k, random_state=seed)
model = LogisticRegression(tol=.05)
results = model_selection.cross_val_score(model, df[train_cols], df['How.happy.do.you.feel.right.now.'], cv=kFold)
print(results.mean())

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').