In [36]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from mpl_toolkits.mplot3d import Axes3D
from sklearn.datasets import make_regression
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.compose import make_column_transformer

In [37]:
df = pd.read_csv('Resources/2016_SocialSurvey_Mil_AgeGrp.csv')

In [38]:
df.shape

(704, 128)

In [39]:
df.columns

Index(['Gss year for this respondent                       ',
       'Respondent id number', 'Labor force status',
       'Number of hours worked last week',
       'Number of hours usually work a week', 'Ever work as long as one year',
       'R self-emp or works for somebody', 'Rs census occupation code (1980)',
       'Rs industry code   (1980)', 'Rs census occupation code (2010)',
       ...
       'Importance of personal contact at work',
       'How often do you work from home', 'Do you work weekends',
       'How are your working hours decided', 'What is your working schedule',
       'I would change my work for something different',
       'Did you work multiple jobs in past yr',
       'How much earned from additional jobs in past yr', 'Job satisfaction',
       'Type of structure in which the respondent lives'],
      dtype='object', length=128)

In [40]:
df.isna().sum()

Gss year for this respondent                           0
Respondent id number                                   0
Labor force status                                     0
Number of hours worked last week                       0
Number of hours usually work a week                    0
                                                      ..
I would change my work for something different         0
Did you work multiple jobs in past yr                  0
How much earned from additional jobs in past yr        0
Job satisfaction                                       0
Type of structure in which the respondent lives        0
Length: 128, dtype: int64

In [41]:
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,Gss year for this respondent,Respondent id number,Labor force status,Number of hours worked last week,Number of hours usually work a week,Ever work as long as one year,R self-emp or works for somebody,Rs census occupation code (1980),Rs industry code (1980),Rs census occupation code (2010),...,Importance of personal contact at work,How often do you work from home,Do you work weekends,How are your working hours decided,What is your working schedule,I would change my work for something different,Did you work multiple jobs in past yr,How much earned from additional jobs in past yr,Job satisfaction,Type of structure in which the respondent lives
0,2016,8,Working parttime,30,Not applicable,Not applicable,Someone else,Not applicable,Not applicable,Personal care aides,...,Very important,Sometimes,Always,I have a schedule or shift which regularly cha...,I have a schedule or shift which regularly cha...,Strongly agree,No,Not applicable,Not applicable,"2-FAMILY HOUSE, 2 UNITS ONE ABOVE THE OTHER"
1,2016,11,Keeping house,Not applicable,Not applicable,Yes,Someone else,Not applicable,Not applicable,"Nursing, psychiatric, and home health aides",...,Very important,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,DETACHED SINGLE FAMILY HOUSE
2,2016,16,Working fulltime,42,Not applicable,Not applicable,Someone else,Not applicable,Not applicable,Designers,...,Important,Never,Always,Starting and finishing times are decided by my...,"I have a regular schedule or shift (daytime, e...",Strongly disagree,No,Not applicable,Not applicable,"2-FAMILY HOUSE, 2 UNITS SIDE-BY-SIDE"
3,2016,19,Working fulltime,40,Not applicable,Not applicable,Someone else,Not applicable,Not applicable,Elementary and middle school teachers,...,Neither important nor unimportant,Hardly ever,Sometimes,Starting and finishing times are decided by my...,"I have a regular schedule or shift (daytime, e...",Agree,No,Not applicable,Not applicable,"APARTMENT HOUSE (5 OR MORE UNITS, 4 STORIES OR..."
4,2016,23,Working parttime,15,Not applicable,Not applicable,Someone else,Not applicable,Not applicable,Cashiers,...,Important,Never,Always,Starting and finishing times are decided by my...,I have a schedule or shift which regularly cha...,Strongly agree,No,Not applicable,Not applicable,"APARTMENT HOUSE (5 OR MORE UNITS, 3 STORIES OR..."


In [43]:
df.shape

(704, 128)

In [44]:
pd.get_dummies(df).shape

(704, 1088)

In [45]:
for col in df.columns:
    if len(df[col].unique()) == 1:
        df.drop(col,inplace=True,axis=1)
df.shape

(704, 94)

In [46]:
df

Unnamed: 0,Respondent id number,Labor force status,Number of hours worked last week,Number of hours usually work a week,Ever work as long as one year,R self-emp or works for somebody,Rs census occupation code (2010),Number of children,Age of respondent,Highest year of school completed,...,Importance of personal contact at work,How often do you work from home,Do you work weekends,How are your working hours decided,What is your working schedule,I would change my work for something different,Did you work multiple jobs in past yr,How much earned from additional jobs in past yr,Job satisfaction,Type of structure in which the respondent lives
0,8,Working parttime,30,Not applicable,Not applicable,Someone else,Personal care aides,3,23,11,...,Very important,Sometimes,Always,I have a schedule or shift which regularly cha...,I have a schedule or shift which regularly cha...,Strongly agree,No,Not applicable,Not applicable,"2-FAMILY HOUSE, 2 UNITS ONE ABOVE THE OTHER"
1,11,Keeping house,Not applicable,Not applicable,Yes,Someone else,"Nursing, psychiatric, and home health aides",5,33,12,...,Very important,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,DETACHED SINGLE FAMILY HOUSE
2,16,Working fulltime,42,Not applicable,Not applicable,Someone else,Designers,2,33,13,...,Important,Never,Always,Starting and finishing times are decided by my...,"I have a regular schedule or shift (daytime, e...",Strongly disagree,No,Not applicable,Not applicable,"2-FAMILY HOUSE, 2 UNITS SIDE-BY-SIDE"
3,19,Working fulltime,40,Not applicable,Not applicable,Someone else,Elementary and middle school teachers,0,31,18,...,Neither important nor unimportant,Hardly ever,Sometimes,Starting and finishing times are decided by my...,"I have a regular schedule or shift (daytime, e...",Agree,No,Not applicable,Not applicable,"APARTMENT HOUSE (5 OR MORE UNITS, 4 STORIES OR..."
4,23,Working parttime,15,Not applicable,Not applicable,Someone else,Cashiers,2,23,10,...,Important,Never,Always,Starting and finishing times are decided by my...,I have a schedule or shift which regularly cha...,Strongly agree,No,Not applicable,Not applicable,"APARTMENT HOUSE (5 OR MORE UNITS, 3 STORIES OR..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
699,2833,Working fulltime,50,Not applicable,Not applicable,Someone else,Elementary and middle school teachers,0,27,18,...,Very important,Always,Often,Starting and finishing times are decided by my...,"I have a regular schedule or shift (daytime, e...",Strongly disagree,No,Not applicable,Not applicable,DETACHED SINGLE FAMILY HOUSE
700,2839,Working fulltime,37,Not applicable,Not applicable,Someone else,"Bookkeeping, accounting, and auditing clerks",1,30,16,...,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,DETACHED SINGLE FAMILY HOUSE
701,2845,Working fulltime,40,Not applicable,Not applicable,Someone else,Teacher assistants,0,24,14,...,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Very satisfied,DETACHED 3-4 FAMILY HOUSE
702,2850,Working fulltime,40,Not applicable,Not applicable,Someone else,"Laborers and freight, stock, and material move...",1,34,12,...,Important,Never,Sometimes,Starting and finishing times are decided by my...,"I have a regular schedule or shift (daytime, e...",Strongly agree,No,Not applicable,Not applicable,TRAILER


In [51]:
df.columns

Index(['Respondent id number', 'Labor force status',
       'Number of hours worked last week',
       'Number of hours usually work a week', 'Ever work as long as one year',
       'R self-emp or works for somebody', 'Rs census occupation code (2010)',
       'Number of children', 'Age of respondent',
       'Highest year of school completed', 'Rs highest degree',
       'Respondents sex', 'Race of respondent', 'Was r born in this country',
       'Number of persons in household', 'How many in family earned money',
       'Total family income', 'Respondents income', 'Health -- version y',
       'General happiness', 'Condition of health', 'Is life exciting or dull',
       'To work hard', 'Weeks r. worked last year',
       'Was r's work part-time or full-time?',
       'Satisfaction with financial situation', 'Opinion of family income',
       'Ever unemployed in last ten yrs', 'Opinion of how people get ahead',
       'Rs living standard compared to parents',
       'Days of poor me

In [58]:
df = df.drop(columns=['Respondent id number',
        'Number of hours usually work a week',
        'Ever work as long as one year',
        'Use internet/apps more than occassionally',
       'Use internet/apps from phone or tablet',
       'Used internet/apps yesterday', 'Use twitter', 'Use facebook',
       'Use instagrm', 'Use linkedin', 'Use snapchat', 'Use tumblr',
       'Use whatsapp', 'Use googlesn', 'Use pinterst', 'Use flickr',
       'Use vine', 'Use clssmtes', 'Minutes of internet use on weekdays',
       'Hours of internet use on weekdays',
       'Minutes of internet use on weekends',
       'Hours of internet use on weekends'])

In [53]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

In [59]:
df.columns

Index(['Labor force status', 'Number of hours worked last week',
       'R self-emp or works for somebody', 'Rs census occupation code (2010)',
       'Number of children', 'Age of respondent',
       'Highest year of school completed', 'Rs highest degree',
       'Respondents sex', 'Race of respondent', 'Was r born in this country',
       'Number of persons in household', 'How many in family earned money',
       'Total family income', 'Respondents income', 'Health -- version y',
       'General happiness', 'Condition of health', 'Is life exciting or dull',
       'To work hard', 'Weeks r. worked last year',
       'Was r's work part-time or full-time?',
       'Satisfaction with financial situation', 'Opinion of family income',
       'Ever unemployed in last ten yrs', 'Opinion of how people get ahead',
       'Rs living standard compared to parents',
       'Days of poor mental health past 30 days',
       'How often does r find work stressful',
       'The highest degree r have ea

In [62]:
df.dtypes

Labor force status                                 object
Number of hours worked last week                   object
R self-emp or works for somebody                   object
Rs census occupation code (2010)                   object
Number of children                                 object
                                                    ...  
I would change my work for something different     object
Did you work multiple jobs in past yr              object
How much earned from additional jobs in past yr    object
Job satisfaction                                   object
Type of structure in which the respondent lives    object
Length: 72, dtype: object

In [47]:
x = df.iloc[:,:].values
z_pre = pd.DataFrame(x)

In [48]:
x

array([[8, 'Working parttime', '30', ..., 'Not applicable',
        'Not applicable', '2-FAMILY HOUSE, 2 UNITS ONE ABOVE THE OTHER'],
       [11, 'Keeping house', 'Not applicable', ..., 'Not applicable',
        'Not applicable', 'DETACHED SINGLE FAMILY HOUSE'],
       [16, 'Working fulltime', '42', ..., 'Not applicable',
        'Not applicable', '2-FAMILY HOUSE, 2 UNITS SIDE-BY-SIDE'],
       ...,
       [2845, 'Working fulltime', '40', ..., 'Not applicable',
        'Very satisfied', 'DETACHED 3-4 FAMILY HOUSE'],
       [2850, 'Working fulltime', '40', ..., 'Not applicable',
        'Not applicable', 'TRAILER'],
       [2857, 'Working fulltime', '40', ..., 'Not applicable',
        'Very dissatisfied', 'DETACHED SINGLE FAMILY HOUSE']],
      dtype=object)

In [49]:
z_pre

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,84,85,86,87,88,89,90,91,92,93
0,8,Working parttime,30,Not applicable,Not applicable,Someone else,Personal care aides,3,23,11,...,Very important,Sometimes,Always,I have a schedule or shift which regularly cha...,I have a schedule or shift which regularly cha...,Strongly agree,No,Not applicable,Not applicable,"2-FAMILY HOUSE, 2 UNITS ONE ABOVE THE OTHER"
1,11,Keeping house,Not applicable,Not applicable,Yes,Someone else,"Nursing, psychiatric, and home health aides",5,33,12,...,Very important,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,DETACHED SINGLE FAMILY HOUSE
2,16,Working fulltime,42,Not applicable,Not applicable,Someone else,Designers,2,33,13,...,Important,Never,Always,Starting and finishing times are decided by my...,"I have a regular schedule or shift (daytime, e...",Strongly disagree,No,Not applicable,Not applicable,"2-FAMILY HOUSE, 2 UNITS SIDE-BY-SIDE"
3,19,Working fulltime,40,Not applicable,Not applicable,Someone else,Elementary and middle school teachers,0,31,18,...,Neither important nor unimportant,Hardly ever,Sometimes,Starting and finishing times are decided by my...,"I have a regular schedule or shift (daytime, e...",Agree,No,Not applicable,Not applicable,"APARTMENT HOUSE (5 OR MORE UNITS, 4 STORIES OR..."
4,23,Working parttime,15,Not applicable,Not applicable,Someone else,Cashiers,2,23,10,...,Important,Never,Always,Starting and finishing times are decided by my...,I have a schedule or shift which regularly cha...,Strongly agree,No,Not applicable,Not applicable,"APARTMENT HOUSE (5 OR MORE UNITS, 3 STORIES OR..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
699,2833,Working fulltime,50,Not applicable,Not applicable,Someone else,Elementary and middle school teachers,0,27,18,...,Very important,Always,Often,Starting and finishing times are decided by my...,"I have a regular schedule or shift (daytime, e...",Strongly disagree,No,Not applicable,Not applicable,DETACHED SINGLE FAMILY HOUSE
700,2839,Working fulltime,37,Not applicable,Not applicable,Someone else,"Bookkeeping, accounting, and auditing clerks",1,30,16,...,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,DETACHED SINGLE FAMILY HOUSE
701,2845,Working fulltime,40,Not applicable,Not applicable,Someone else,Teacher assistants,0,24,14,...,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Very satisfied,DETACHED 3-4 FAMILY HOUSE
702,2850,Working fulltime,40,Not applicable,Not applicable,Someone else,"Laborers and freight, stock, and material move...",1,34,12,...,Important,Never,Sometimes,Starting and finishing times are decided by my...,"I have a regular schedule or shift (daytime, e...",Strongly agree,No,Not applicable,Not applicable,TRAILER


In [50]:
labelEncoder_x = LabelEncoder()
x[:,:]= labelEncoder_x.fit_transform(x[:,:])
z = pd.DataFrame(x)

ValueError: bad input shape (704, 94)