In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler
import pickle

In [2]:
# Lookup from the powers of two 1, 2, 4, ..., 64 to the consecutive
# integers 1..7 (i.e. 2**k -> k + 1). Applied to df.round_reached further
# down to build a linearly spaced version of that column.
round_dict = {2 ** exponent: exponent + 1 for exponent in range(7)}

In [3]:
# Season being predicted; the notebook trains on the season before it.
# Both are kept as strings because they are spliced into file names below.
this_year = '2018'
last_year = '{:d}'.format(int(this_year) - 1)

In [4]:
# Load last year's tournament results and Twitter data, then join them.
results_df = pd.read_csv('Input/march_madness_results_' + last_year + '.csv')
twitter_df = pd.read_csv('Input/march_madness_tweet_data_' + last_year + '.csv')

# Columns that must match in both files for rows to be paired up.
merge_keys = ['seed',
              'conference',
              'seed_conference',
              'team',
              'twitter_handle']

# Left join: every Twitter row is kept; rows with no matching result get
# NaN in the columns that come from results_df.
df = pd.merge(twitter_df, results_df, how='left', on=merge_keys)

In [5]:
# Build a "<feature>_rank" column for each input feature of last year's data.
tweet_features = ['num_favorites', 'num_followers', 'num_friends', 'num_statuses', 'retweet_avg', 'seed']
rank_features = [name + "_rank" for name in tweet_features]
all_features = tweet_features + rank_features

for feature, rank_column in zip(tweet_features, rank_features):
    # 'seed' is ranked ascending (smallest value gets rank 1); every other
    # feature is ranked descending (largest value gets rank 1).
    df[rank_column] = df[feature].rank(ascending=(feature == 'seed'))

# Row-wise mean of all the rank columns.
df['average_rank'] = df[rank_features].mean(axis=1)

In [6]:
# Reference column: round_reached translated through round_dict
# (1, 2, 4, ..., 64 -> 1, 2, 3, ..., 7). Values that are not keys of
# round_dict come out as NaN.
df['round_reached_linear'] = df['round_reached'].map(round_dict)

In [7]:
# Target: the raw round_reached column (not round_reached_linear).
y = df['round_reached']

# Inputs: the raw features plus their rank columns, standardised by a
# StandardScaler that is fitted on this same data.
standard_scaler = StandardScaler()
x = standard_scaler.fit_transform(df[all_features])

In [8]:
# Fit an ordinary least-squares regression of y on the scaled features.
model = LinearRegression().fit(x, y)

# For a LinearRegression, score() is the coefficient of determination (R^2),
# and it is evaluated here on the same x, y the model was fitted on. The
# printed figure is therefore an in-sample goodness of fit, not a
# classification accuracy and not a held-out estimate.
training_score = model.score(x, y)
print('Model accuracy: ', training_score)

Model accuracy:  0.600947396053


In [9]:
# Persist the fitted model so it can be applied to the current year's data.
# NOTE(review): only `model` is pickled; the fitted `standard_scaler` is not
# written anywhere in this cell -- confirm the consuming notebook scales its
# inputs in a compatible way.
model_path = 'Output/march_madness_model_' + last_year + '.pickle'
with open(model_path, 'wb') as model_file:
    pickle.dump(model, model_file)

In [10]:
# Attach the model's in-sample predictions to the feature table and write
# everything out together for exploration.
df['py_model_prediction'] = model.predict(x)
# Ascending rank: the smallest predicted value gets rank 1.
df['py_model_prediction_rank'] = df['py_model_prediction'].rank()

# Order rows by the actual outcome before exporting (the DataFrame index is
# written to the CSV as well, since to_csv is called with its defaults).
df = df.sort_values(by='round_reached')
predictions_path = 'Output/march_madness_predictions_' + last_year + '.csv'
df.to_csv(predictions_path)