# Feature Engineering 2: dimensionality and PCA

### 1. Create a new score variable from the weighted averages of all score variables in the datasets. **Notice that the number of students in the 4th grade isn't the same as the number of students in the 8th grade. So, you should appropriately weigh the scores!**.

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
from scipy.stats.mstats import winsorize
from sqlalchemy import create_engine
import warnings

warnings.filterwarnings('ignore')
sns.set(style="whitegrid")

In [2]:
# Establish postgres access details
postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'useducation'

# Start the engine and connect to the database
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

# Perform a select query
education_df = pd.read_sql_query('SELECT * FROM useducation', con=engine)

# Dispose the engine. We don't need it anymore
engine.dispose()

In [3]:
education_df.head()

Unnamed: 0,PRIMARY_KEY,STATE,YEAR,ENROLL,TOTAL_REVENUE,FEDERAL_REVENUE,STATE_REVENUE,LOCAL_REVENUE,TOTAL_EXPENDITURE,INSTRUCTION_EXPENDITURE,...,GRADES_4_G,GRADES_8_G,GRADES_12_G,GRADES_1_8_G,GRADES_9_12_G,GRADES_ALL_G,AVG_MATH_4_SCORE,AVG_MATH_8_SCORE,AVG_READING_4_SCORE,AVG_READING_8_SCORE
0,1992_ALABAMA,ALABAMA,1992,,2678885.0,304177.0,1659028.0,715680.0,2653798.0,1481703.0,...,57948.0,58025.0,41167.0,471564.0,196386.0,676174.0,208.327876,252.187522,207.963517,
1,1992_ALASKA,ALASKA,1992,,1049591.0,106780.0,720711.0,222100.0,972488.0,498362.0,...,9748.0,8789.0,6714.0,79117.0,30847.0,112335.0,,,,258.859712
2,1992_ARIZONA,ARIZONA,1992,,3258079.0,297888.0,1369815.0,1590376.0,3401580.0,1435908.0,...,55433.0,49081.0,37410.0,437127.0,175210.0,614881.0,215.253932,265.366278,206.212716,262.169895
3,1992_ARKANSAS,ARKANSAS,1992,,1711959.0,178571.0,958785.0,574603.0,1743022.0,964323.0,...,34632.0,36011.0,27651.0,281338.0,123113.0,405259.0,210.206028,256.31209,208.634458,264.619665
4,1992_CALIFORNIA,CALIFORNIA,1992,,26260025.0,2072470.0,16546514.0,7641041.0,27138832.0,14358922.0,...,418418.0,363296.0,270675.0,3286034.0,1372011.0,4717112.0,208.398961,260.892247,196.764414,


In [10]:
def missingness_summary(df, print_log, sort):
    s = df.isna().sum()*100/len(df)
    if sort == 'asc':
        s.sort_values(ascending=True, inplace=True)
    elif sort == 'desc':
        s.sort_values(ascending=False, inplace=True)
    if print_log == True:
        print(s)
    return s

missingness_summary(education_df, True, 'desc')

# We'll use linear interpolation to fill this dataset
education_df = education_df.interpolate(method='linear')

# Since we'll have some leftover nulls at the bounds, we'll use grouped medians to fill in the rest
for year in education_df['YEAR'].unique():
    education_df = education_df.fillna(education_df.loc[inter_df['YEAR'] == year].median())


print('Summary after filling nulls')
missingness_summary(education_df, True, 'desc')
print('\nThat seemed to fill everything just fine!')

AVG_READING_8_SCORE             0.0
OTHER_EXPENDITURE               0.0
STATE                           0.0
YEAR                            0.0
ENROLL                          0.0
TOTAL_REVENUE                   0.0
FEDERAL_REVENUE                 0.0
STATE_REVENUE                   0.0
LOCAL_REVENUE                   0.0
TOTAL_EXPENDITURE               0.0
INSTRUCTION_EXPENDITURE         0.0
SUPPORT_SERVICES_EXPENDITURE    0.0
CAPITAL_OUTLAY_EXPENDITURE      0.0
AVG_READING_4_SCORE             0.0
GRADES_PK_G                     0.0
GRADES_KG_G                     0.0
GRADES_4_G                      0.0
GRADES_8_G                      0.0
GRADES_12_G                     0.0
GRADES_1_8_G                    0.0
GRADES_9_12_G                   0.0
GRADES_ALL_G                    0.0
AVG_MATH_4_SCORE                0.0
AVG_MATH_8_SCORE                0.0
PRIMARY_KEY                     0.0
dtype: float64
Summary after filling nulls
AVG_READING_8_SCORE             0.0
OTHER_EXPENDITURE    

In [26]:
# Now for the outliers. Let's identify them

# ----- FUNCTIONS FOR GENERAL OUTLIER HANDLING --------------|
def get_minmax_with_threshold(s, threshold):
    q75, q25 = np.percentile(s, [75,25])
    iqr = q75 - q25
    min_val = q25 - (iqr*threshold)
    max_val = q75 + (iqr*threshold)
    
    return min_val, max_val
    
def get_outliers(s, threshold):
    min_val, max_val = get_minmax_with_threshold(s, threshold)
    return s.loc[(s > max_val) | (s < min_val)]

def outliers_summary(df, threshold, print_log, sort):    
    s = pd.Series([get_outliers(df[col], threshold).count() *100 / len(df[col])
                   for col in df.select_dtypes(include='number').columns],
                 index=df.select_dtypes(include='number').columns)
    
    if sort == 'asc':
        s.sort_values(ascending=True, inplace=True)
    elif sort == 'desc':
        s.sort_values(ascending=False, inplace=True)
    if print_log == True:
        print(s)
        
    return s

def get_percentiles(df, column_name, threshold):
    min_val, max_val = get_minmax_with_threshold(df[column_name], threshold)
    
    max_percentile = df.loc[df[column_name] >= max_val, column_name].count() / len(df[column_name])
    min_percentile = df.loc[df[column_name] <= min_val, column_name].count() / len(df[column_name])
    
    return min_percentile, max_percentile
# ------- END OF OUTLIER SELF_MADE FUNCS ----------------------|

THRES = 3.5

outliers = outliers_summary(education_df, THRES, True, 'desc')

# Since we have a few outliers, let's get rid of them!
# We'll use the winsorize method. To choose a percentile, we'll create one out of our max/min we got with the threshold

for column in outliers.loc[outliers > 0].index:
    percentiles = get_percentiles(education_df, column, THRES) #We've been using 3.5 as a threshold
    education_df[column] = winsorize(education_df[column], percentiles)
    
outliers_summary(education_df, THRES, False, 'asc')

GRADES_4_G                      3.083110
GRADES_1_8_G                    3.083110
GRADES_ALL_G                    3.016086
GRADES_KG_G                     2.949062
GRADES_8_G                      2.882038
GRADES_9_12_G                   2.613941
GRADES_PK_G                     2.345845
GRADES_12_G                     2.278820
FEDERAL_REVENUE                 1.407507
STATE_REVENUE                   1.005362
ENROLL                          1.005362
CAPITAL_OUTLAY_EXPENDITURE      0.871314
OTHER_EXPENDITURE               0.670241
INSTRUCTION_EXPENDITURE         0.670241
TOTAL_REVENUE                   0.402145
TOTAL_EXPENDITURE               0.402145
AVG_READING_4_SCORE             0.335121
SUPPORT_SERVICES_EXPENDITURE    0.201072
AVG_READING_8_SCORE             0.134048
LOCAL_REVENUE                   0.067024
AVG_MATH_4_SCORE                0.000000
AVG_MATH_8_SCORE                0.000000
YEAR                            0.000000
dtype: float64


YEAR                            0.0
AVG_MATH_8_SCORE                0.0
AVG_MATH_4_SCORE                0.0
GRADES_ALL_G                    0.0
GRADES_9_12_G                   0.0
GRADES_1_8_G                    0.0
GRADES_12_G                     0.0
GRADES_8_G                      0.0
GRADES_4_G                      0.0
GRADES_KG_G                     0.0
AVG_READING_4_SCORE             0.0
GRADES_PK_G                     0.0
OTHER_EXPENDITURE               0.0
SUPPORT_SERVICES_EXPENDITURE    0.0
INSTRUCTION_EXPENDITURE         0.0
TOTAL_EXPENDITURE               0.0
LOCAL_REVENUE                   0.0
STATE_REVENUE                   0.0
FEDERAL_REVENUE                 0.0
TOTAL_REVENUE                   0.0
ENROLL                          0.0
CAPITAL_OUTLAY_EXPENDITURE      0.0
AVG_READING_8_SCORE             0.0
dtype: float64

In [28]:
# Now we have data with no nulls or outliers
print(education_df.info())
education_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1492 entries, 0 to 1491
Data columns (total 25 columns):
PRIMARY_KEY                     1492 non-null object
STATE                           1492 non-null object
YEAR                            1492 non-null int64
ENROLL                          1492 non-null float64
TOTAL_REVENUE                   1492 non-null float64
FEDERAL_REVENUE                 1492 non-null float64
STATE_REVENUE                   1492 non-null float64
LOCAL_REVENUE                   1492 non-null float64
TOTAL_EXPENDITURE               1492 non-null float64
INSTRUCTION_EXPENDITURE         1492 non-null float64
SUPPORT_SERVICES_EXPENDITURE    1492 non-null float64
OTHER_EXPENDITURE               1492 non-null float64
CAPITAL_OUTLAY_EXPENDITURE      1492 non-null float64
GRADES_PK_G                     1492 non-null float64
GRADES_KG_G                     1492 non-null float64
GRADES_4_G                      1492 non-null float64
GRADES_8_G                      1

Unnamed: 0,YEAR,ENROLL,TOTAL_REVENUE,FEDERAL_REVENUE,STATE_REVENUE,LOCAL_REVENUE,TOTAL_EXPENDITURE,INSTRUCTION_EXPENDITURE,SUPPORT_SERVICES_EXPENDITURE,OTHER_EXPENDITURE,...,GRADES_4_G,GRADES_8_G,GRADES_12_G,GRADES_1_8_G,GRADES_9_12_G,GRADES_ALL_G,AVG_MATH_4_SCORE,AVG_MATH_8_SCORE,AVG_READING_4_SCORE,AVG_READING_8_SCORE
count,1492.0,1492.0,1492.0,1492.0,1492.0,1492.0,1492.0,1492.0,1492.0,1492.0,...,1492.0,1492.0,1492.0,1492.0,1492.0,1492.0,1492.0,1492.0,1492.0,1492.0
mean,2004.433646,992065.9,11088030.0,860253.2,4843120.0,5305244.0,11121330.0,5765691.0,3319725.0,489869.4,...,56115.50067,56333.514075,47654.376005,452073.5,215955.7,701836.7,232.723472,275.734024,219.00771,263.268789
std,7.393983,989911.3,11871420.0,925363.9,5000283.0,5896910.0,12004360.0,6239367.0,3474660.0,507335.6,...,62361.693756,63161.00617,53734.370864,503188.8,243229.0,728765.4,9.405215,8.754805,5.782655,4.967521
min,1992.0,43866.0,465650.0,31020.0,0.0,22093.0,481665.0,265549.0,139963.0,11541.0,...,633.0,437.0,311.0,4878.0,1808.0,7254.0,187.13467,232.83151,196.762268,238.700733
25%,1998.0,311339.8,2546261.0,217020.0,1356434.0,944796.2,2523968.0,1343611.0,767368.8,130917.4,...,10117.25,10252.0,8951.0,81273.25,39506.75,171866.5,226.267129,270.096001,216.126818,260.490867
50%,2004.0,737393.5,6351760.0,516519.0,3119414.0,2650904.0,6499891.0,3318778.0,1910412.0,330126.0,...,38280.5,38114.5,33279.0,309734.0,147235.5,459124.8,234.226904,275.711254,219.771325,262.615528
75%,2011.0,1595024.0,18036500.0,1411794.0,8020633.0,8584803.0,18017040.0,9130766.0,5435244.0,830838.5,...,72310.25,72598.25,64920.5,578340.0,284350.5,886094.8,240.163315,282.702092,222.684611,266.847503
max,2017.0,6045030.0,70953330.0,5118477.0,31005320.0,34941510.0,71227110.0,35617960.0,21693680.0,3259244.0,...,284394.0,290666.0,257151.0,2291961.0,1116572.0,3339616.0,253.420961,300.568235,236.773867,280.49913


In [30]:
education_df.head()

Unnamed: 0,PRIMARY_KEY,STATE,YEAR,ENROLL,TOTAL_REVENUE,FEDERAL_REVENUE,STATE_REVENUE,LOCAL_REVENUE,TOTAL_EXPENDITURE,INSTRUCTION_EXPENDITURE,...,GRADES_4_G,GRADES_8_G,GRADES_12_G,GRADES_1_8_G,GRADES_9_12_G,GRADES_ALL_G,AVG_MATH_4_SCORE,AVG_MATH_8_SCORE,AVG_READING_4_SCORE,AVG_READING_8_SCORE
0,1992_ALABAMA,ALABAMA,1992,508401.78125,2678885.0,304177.0,1659028.0,715680.0,2653798.0,1481703.0,...,57948.0,58025.0,41167.0,471564.0,196386.0,676174.0,208.327876,252.187522,207.963517,261.925519
1,1992_ALASKA,ALASKA,1992,508401.78125,1049591.0,106780.0,720711.0,222100.0,972488.0,498362.0,...,9748.0,8789.0,6714.0,79117.0,30847.0,112335.0,211.790904,258.7769,207.088116,258.859712
2,1992_ARIZONA,ARIZONA,1992,508401.78125,3258079.0,297888.0,1369815.0,1590376.0,3401580.0,1435908.0,...,55433.0,49081.0,37410.0,437127.0,175210.0,614881.0,215.253932,265.366278,206.212716,262.169895
3,1992_ARKANSAS,ARKANSAS,1992,508401.78125,1711959.0,178571.0,958785.0,574603.0,1743022.0,964323.0,...,34632.0,36011.0,27651.0,281338.0,123113.0,405259.0,210.206028,256.31209,208.634458,264.619665
4,1992_CALIFORNIA,CALIFORNIA,1992,508401.78125,26260025.0,2072470.0,16546514.0,7641041.0,27138832.0,14358922.0,...,284394.0,290666.0,257151.0,2291961.0,1116572.0,3339616.0,208.398961,260.892247,196.764414,265.519676


In [42]:
# Now to add the weighted test score feature!
education_df['WEIGHTED_SCORE'] = ((education_df['AVG_MATH_4_SCORE'] + education_df['AVG_READING_4_SCORE']) 
                                    / (2*education_df['GRADES_4_G'])
                                + (education_df['AVG_MATH_8_SCORE'] + education_df['AVG_READING_8_SCORE']) 
                                    / (2*education_df['GRADES_8_G'])
                                * (education_df['GRADES_4_G'] + education_df['GRADES_8_G'])/2)
look_columns = ['GRADES_4_G', 'GRADES_8_G', 'AVG_MATH_4_SCORE', 'AVG_READING_4_SCORE', 'AVG_MATH_8_SCORE', 'AVG_READING_8_SCORE','WEIGHTED_SCORE']
education_df[look_columns].head()

Unnamed: 0,GRADES_4_G,GRADES_8_G,AVG_MATH_4_SCORE,AVG_READING_4_SCORE,AVG_MATH_8_SCORE,AVG_READING_8_SCORE,WEIGHTED_SCORE
0,57948.0,58025.0,208.327876,207.963517,252.187522,261.925519,256.889554
1,9748.0,8789.0,211.790904,207.088116,258.7769,258.859712,272.960099
2,55433.0,49081.0,215.253932,206.212716,265.366278,262.169895,280.840151
3,34632.0,36011.0,210.206028,208.634458,256.31209,264.619665,255.484803
4,284394.0,290666.0,208.398961,196.764414,260.892247,265.519676,260.36694


### 2. What are the correlations between this newly created score variable and the expenditure types? Which 1 of the expenditure types is more correlated than the others?

In [53]:
# Choose which columns I want to create a correlation table out of
corr_columns = ['OTHER_EXPENDITURE','SUPPORT_SERVICES_EXPENDITURE','INSTRUCTION_EXPENDITURE','TOTAL_EXPENDITURE','WEIGHTED_SCORE']

# Make the correlation table
exp_score_corr = education_df[corr_columns].corr()

#Print the correlations for our desired outcome
print(exp_score_corr.loc['WEIGHTED_SCORE',:].drop('WEIGHTED_SCORE', axis=0))

print('\nThe SUPPORT SERVICES EXPENDITURE type is more correlated with overall score than the others')

OTHER_EXPENDITURE               0.171523
SUPPORT_SERVICES_EXPENDITURE    0.223500
INSTRUCTION_EXPENDITURE         0.200198
TOTAL_EXPENDITURE               0.195094
Name: WEIGHTED_SCORE, dtype: float64

The SUPPORT SERVICES EXPENDITURE type is more correlated with overall score than the others


### 3. Now, apply PCA to the 4 expenditure types. How much of the total variance is explained by the 1st component?

In [65]:
# Identify our feature columns
feature_columns = corr_columns
if 'WEIGHTED_SCORE' in feature_columns: feature_columns.remove('WEIGHTED_SCORE')

#Import the proper library
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

Y = education_df['WEIGHTED_SCORE']
X = education_df.loc[:,feature_columns]

n_components = 1
pca = PCA(n_components=n_components)

pca.fit(X)
principal_components = pca.transform(X)

principal_df = pd.DataFrame(principal_components,
                           columns=['principal_component_{}'.format(i+1) for i in range(n_components)])

principal_df.head()

Unnamed: 0,principal_component_1
0,-9838923.0
1,-11817550.0
2,-9148989.0
3,-10914680.0
4,18874410.0


In [66]:
the_corr = Y.corr(principal_df['principal_component_1'])

print('the correlation between the target and the first principal component is {}'.format(the_corr))

the correlation between the target and the first principal component is 0.1980861103108821


### 5. If you were to choose the best variables for your model, would you prefer using the 1st principal component instead of the expenditure variables? Why?


In [73]:
avg_feature_corr = exp_score_corr.loc['WEIGHTED_SCORE',:].drop('WEIGHTED_SCORE', axis=0).sum()/(len(exp_score_corr) - 1)

print('Average correlation between features', avg_feature_corr)
print('Correlation between first principal', the_corr)

print('I would use the principal component. I compared it\'s correlation to that of the average for the features, and the principal component had a higher correlation.')
print('This leads me to believe that the prinipal component, while minimizing the dimensionality of my training set, would also eliminate some affects of noise data')

Average correlation between features 0.1975785778923553
Correlation between first principal 0.1980861103108821
I would use the principal component. I compared it's correlation to that of the average for the features, and the principal component had a higher correlation.
This leads me to believe that the prinipal component, while minimizing the dimensionality of my training set, would also eliminate some affects of noise data


# Finish it!