In [39]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

# Helper Functions

These functions train a model on your data and use cross-validation to determine the average negative mean squared error (the closer to 0, the better). You do not have to edit these! However, it might make sense to add new helper functions here.

In [40]:
def scoreLinear(X, y):
    """Score a linear regression model using 3-fold cross validation.

    Parameters:
    X (array): Training data of shape (n_samples, n_features)
    y (array): Target values of shape (n_samples,)

    Returns:
    Float: mean of the per-fold negative mean squared errors
    """
    fold_scores = cross_val_score(
        linear_model.LinearRegression(),
        X,
        y,
        cv=3,
        scoring='neg_mean_squared_error',
    )
    return fold_scores.mean()

In [41]:
def scoreBoosting(X, y):
    """Score a gradient boosting regressor using 3-fold cross validation.

    Parameters:
    X (array): Training data of shape (n_samples, n_features)
    y (array): Target values of shape (n_samples,)

    Returns:
    Float: mean of the per-fold negative mean squared errors
    """
    # Older scikit-learn versions spell this loss 'ls' instead of 'squared_error'.
    booster = GradientBoostingRegressor(
        loss='squared_error',
        learning_rate=0.1,
        n_estimators=100,
        max_depth=2,
        random_state=0,
    )
    # Flatten the target to 1-D before handing it to the regressor.
    flat_target = np.ravel(y)
    fold_scores = cross_val_score(booster, X, flat_target, cv=3, scoring='neg_mean_squared_error')
    return fold_scores.mean()

In [42]:
def dataframeToXy(df, predict_column, feature_columns):
    """Convert the dataframe to a format usable for the ML algorithms.

    Parameters:
    df (dataframe): dataframe holding the target column and the feature columns
    predict_column: name of the column to predict, e.g. 'Score'
    feature_columns: list of feature column names; a single column name given
        as a plain string is accepted as well

    Returns:
    X (array): feature values of shape (n_samples, n_features)
    y (array): target values of shape (n_samples, 1)
    """
    # A bare string would select a 1-D Series; wrap it so X is always 2-D.
    if isinstance(feature_columns, str):
        feature_columns = [feature_columns]
    # Selecting with a list yields a DataFrame, so .values is already 2-D.
    X = df[feature_columns].values  # all features
    y = df[[predict_column]].values.reshape(-1, 1)  # values to predict
    return X, y

In [43]:
def runScoring(df, predict_column, feature_columns):
    """Score the selected features of the dataset with both algorithms.

    Parameters:
    df (dataframe): dataframe with Score and features
    predict_column: name of the value to predict, e.g. 'Score'
    feature_columns: list of all column names that are to be used as features

    Returns:
    linear: negative mean squared error of linear regression
    boost: negative mean squared error of boosting algorithm
    """
    features, target = dataframeToXy(df, predict_column, feature_columns)
    boosting_score = scoreBoosting(features, target)
    linear_score = scoreLinear(features, target)
    # The linear score comes first in the returned pair.
    return linear_score, boosting_score

In [44]:
def runScoringSimple(df, predict_column):
    """Convenience wrapper around runScoring: every column of the dataframe
    except predict_column is used as a feature.

    Returns the same (linear, boost) pair as runScoring.
    """
    feature_names = [*df.columns]
    # Raises ValueError if predict_column is not one of the dataframe's columns.
    feature_names.remove(predict_column)
    return runScoring(df, predict_column, feature_names)

In [45]:
def createBaselineRandom(df, predict_column):
    """Print baseline scores obtained from purely random features.

    In each of 20 rounds, three random integer features are drawn for every
    row and both models are scored on them via runScoring. The per-round
    scores are averaged and printed; nothing is returned.

    Parameters:
    df (dataframe): dataframe containing the column to predict
    predict_column: name of the value to predict, e.g. 'Score'
    """
    # Fixed seed keeps the baseline reproducible (this resets NumPy's global RNG).
    np.random.seed(0)
    # Select the target by name instead of assuming it is called 'Score'.
    baseline = df[[predict_column]].copy()
    n_rows = df.shape[0]
    random_features = ['feature1', 'feature2', 'feature3']
    averageLinearScore = 0
    averageBoostingScore = 0
    rounds = 20
    for _ in range(rounds):
        # Redraw all random features each round, always in the same order.
        for feature_name in random_features:
            baseline[feature_name] = np.random.randint(0, 100, n_rows)
        lin, boost = runScoring(baseline, predict_column, random_features)
        averageLinearScore += lin / rounds
        averageBoostingScore += boost / rounds
    print(f'linear   : {averageLinearScore}')
    print(f'boosting : {averageBoostingScore}')

# Load Data


Recommended: Make sure you have a column named 'Score', preferably as the first column, while the name of the municipality is the index. It is fine to use your previous notebook from assignment 1 to export the data to a .csv or Excel file and simply import that file here; you do not have to show the code for that. Your initial dataframe should look somewhat like the one below, with more features of course.

In [46]:
def load_df():
    """Read the ranking and regional-portrait CSV files and join them.

    Returns:
    dataframe: inner join of both files on 'Gemeinde', indexed by 'Gemeinde'
    """
    ranking = pd.read_csv('./data/ranking.csv')
    portraits = pd.read_csv('./data/regionalportraits.csv')
    # Inner join: only rows whose 'Gemeinde' appears in both files are kept.
    merged = ranking.merge(portraits, on='Gemeinde', how='inner')
    return merged.set_index('Gemeinde')

# Load the merged ranking / regional-portrait data; later cells reuse `df`.
df = load_df()

# Bare expression so the notebook renders the dataframe as the cell output.
df

Unnamed: 0_level_0,Score,Gemeindecode,Einwohner,Einwohner_Veraenderung_Prozent,Bevoelkerungsdichte_km2,Auslaender_Prozent,Alter_0_19_Jahre,Alter_20_64_Jahre,Alter_65_Plus_Jahre,Heiratsziffer,...,Beschaeftigte_Sektor1,Beschaeftigte_Sektor2,Beschaeftigte_Sektor3,Arbeitsstaetten_Total,Arbeitsstaetten_Sektor1,Arbeitsstaetten_Sektor2,Arbeitsstaetten_Sektor3,Leerwohnungsziffer,Neubauwohnungen_pro_1000_Einwohner,Sozialhilfequote
Gemeinde,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Furna,77.7,3862,202.0,0.497512,6.062425,3.465347,24.752475,52.475248,22.772277,0.000000,...,56,10,19,36,19,7,10,1.153846,14.084507,0.00
Sufers,74.9,3695,145.0,12.403101,4.188330,10.344828,26.896552,46.206897,26.896552,0.000000,...,30,25,30,26,8,4,14,0.909091,0.000000,0.00
Safiental,65.7,3672,903.0,-4.746835,5.963545,4.318937,20.265781,54.152824,25.581395,5.543237,...,188,61,188,151,72,14,65,0.258732,2.209945,0.00
Flerden,64.8,3662,247.0,7.860262,40.558292,4.048583,27.935223,53.441296,18.623482,0.000000,...,40,5,29,32,14,4,14,3.205128,23.622047,0.00
Valsot,62.4,3764,841.0,-9.763948,5.290639,7.253270,18.549346,55.291320,26.159334,3.537736,...,108,124,158,113,37,24,52,3.472222,8.149010,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Chur,36.9,3901,37082.0,5.179260,683.665192,20.298258,16.212718,62.442695,21.344588,4.999189,...,143,4019,28858,3813,38,373,3402,1.251926,7.438197,2.85
Laax,36.8,3575,1885.0,40.044577,59.444970,19.734748,14.535809,62.493369,22.970822,6.411969,...,24,123,1298,179,11,29,139,2.725681,20.512821,0.00
Fürstenau,36.5,3633,357.0,2.000000,270.454545,10.924370,20.728291,55.742297,23.529412,2.816901,...,13,0,169,0,4,0,22,1.005025,5.730659,0.00
Silvaplana,34.0,3790,1132.0,15.746421,25.284789,34.893993,13.339223,60.954064,25.706714,4.458315,...,13,58,851,139,4,12,123,0.804505,15.219338,0.00


These are the features that were used to create the original ranking in Assignment 1. Make sure these features are not in your dataset from now on. 

**Original Features:**

- forest_score
- family_score
- wiki_score
- accident_score
- street_score


# Get a first baseline for your model with random values

In [47]:
# Baseline: score both models on random features only and print the averaged
# results. Scores obtained from real features should end up closer to 0.
createBaselineRandom(df, 'Score')

linear   : -123.77997079595366
boosting : -144.13316606570183


**WARNING**: This uses the same dataframe for both the linear model and the boosting model. In your code you will want to use different ones for the different models. Your goal is to get both of these scores as close to 0 as possible. Most likely you will need to take different steps for linear regression and for gradient boosting, and will therefore end up with different features for each model.

# Get a second baseline by using all features you have

In [48]:
# Second baseline: use every column except 'Score' as a feature.
# This assumes we have a dataframe with a column named Score and all other columns are features.
# You might have to change this code if your dataframe looks different.

linear, boost = runScoringSimple(df, 'Score')
print(f'linear   : {linear}')
print(f'boosting : {boost}')

linear   : -88.86384751874573
boosting : -61.653726956689525


# Extend your features and add something of a geographic nature

Take the features you have loaded above and, if you like, extend them with the data from the Gemeindeporträts 2021. You can then extend them further with some kind of geographic information. This can be pretty much anything that uses coordinates, height, relative position to other features, etc. Try to find something reasonable that might help you, but don't worry if you find out during your feature engineering that it is not helpful; you are not required to use the feature for your final model.

In [49]:
# TODO: Insert your code to get a geographic feature here

# Do your own feature engineering here

Your goal is to find the right features to get both scores as close to 0 as possible. Add more columns to your dataframe through feature engineering and choose which ones of those you will use for the machine learning models. Use the methods discussed in class to improve your results even further.

In [50]:
#TODO: Insert your code here 