In [3]:
#Obtain relevant packages
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [4]:
#Import relevant data: climbing_statistics by date and Rainier_Weather by date
df1 = pd.read_csv('Datasets\climbing_statistics.csv')
df2 = pd.read_csv('Datasets\Rainier_Weather.csv')
print(df1.shape)
print(df2.shape)

(4077, 5)
(464, 7)


In [5]:
#Display climbing_statistics
df1.head(200)

Unnamed: 0,Date,Route,Attempted,Succeeded,Success Percentage
0,11/27/2015,Disappointment Cleaver,2,0,0.000000
1,11/21/2015,Disappointment Cleaver,3,0,0.000000
2,10/15/2015,Disappointment Cleaver,2,0,0.000000
3,10/13/2015,Little Tahoma,8,0,0.000000
4,10/9/2015,Disappointment Cleaver,2,0,0.000000
...,...,...,...,...,...
195,8/20/2015,Disappointment Cleaver,2,0,0.000000
196,8/20/2015,Disappointment Cleaver,11,6,0.545455
197,8/20/2015,Disappointment Cleaver,12,9,0.750000
198,8/20/2015,Disappointment Cleaver,2,0,0.000000


In [6]:
#Display Rainier_Weather
df2.head(200)

Unnamed: 0,Date,Battery Voltage AVG,Temperature AVG,Relative Humidity AVG,Wind Speed Daily AVG,Wind Direction AVG,Solare Radiation AVG
0,12/31/2015,13.845000,19.062917,21.870833,21.977792,62.325833,84.915292
1,12/30/2015,13.822917,14.631208,18.493833,3.540542,121.505417,86.192833
2,12/29/2015,13.834583,6.614292,34.072917,0.000000,130.291667,85.100917
3,12/28/2015,13.710417,8.687042,70.557917,0.000000,164.683750,86.241250
4,12/27/2015,13.362500,14.140417,95.754167,0.000000,268.479167,31.090708
...,...,...,...,...,...,...,...
195,6/18/2015,13.580833,33.132917,74.277917,29.122917,112.528750,244.179292
196,6/17/2015,13.509167,40.297083,39.140000,9.701375,19.752917,358.033292
197,6/16/2015,13.566250,38.616667,33.182917,9.260917,28.182083,352.221167
198,6/15/2015,13.550000,35.513750,64.327500,7.011958,29.117083,343.898000


In [9]:
#Zip datasets into 1 training set based on date.
#Allows us to take training and target sets from the same dataframe
df = pd.concat([df1,df2], sort = False)
print(df.shape)
cols = df.columns
agg_funct = {'Date': 'first', 'Route': 'first', 'Attempted': 'first', 'Succeeded': 'first', 'Success Percentage': 'first',
       'Battery Voltage AVG': 'first', 'Temperature AVG': 'first', 'Relative Humidity AVG': 'first',
       'Wind Speed Daily AVG': 'first', 'Wind Direction AVG': 'first', 'Solare Radiation AVG': 'first'}
df_new = df.groupby(df['Date']).agg(agg_funct)
df_new

(4541, 11)


Unnamed: 0_level_0,Date,Route,Attempted,Succeeded,Success Percentage,Battery Voltage AVG,Temperature AVG,Relative Humidity AVG,Wind Speed Daily AVG,Wind Direction AVG,Solare Radiation AVG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1/1/2015,1/1/2015,,,,,13.804167,29.230000,51.766667,6.537208,240.454583,0.000000
1/10/2015,1/10/2015,,,,,13.701667,22.923750,73.931250,20.243625,182.509458,0.000000
1/11/2015,1/11/2015,,,,,13.536667,15.422500,89.992500,8.212958,188.286833,0.000000
1/12/2015,1/12/2015,,,,,13.695000,19.603333,74.555000,3.946417,220.268750,0.216458
1/13/2015,1/13/2015,,,,,13.770833,29.147500,32.265417,2.269917,276.374458,0.005917
...,...,...,...,...,...,...,...,...,...,...,...
9/7/2015,9/7/2015,Disappointment Cleaver,2.0,0.0,0.00,13.461250,35.252917,48.962917,3.895250,235.496667,248.545500
9/8/2014,9/8/2014,Disappointment Cleaver,8.0,8.0,1.00,,,,,,
9/8/2015,9/8/2015,Disappointment Cleaver,2.0,0.0,0.00,13.481667,43.759583,40.110417,7.346542,200.003333,243.235708
9/9/2014,9/9/2014,Disappointment Cleaver,12.0,9.0,0.75,,,,,,


In [15]:
#Some dates only have info on one or the other dataset. 
#To improve accuracy, in the future only rows without success info will be dropped
#and weather information will be interpolated from adjacent days.
df_new = df_new.dropna()
df_new.head(50)
df_new.shape


(204, 11)

In [16]:
#Regression to determine success percentage based on weather for the day
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

In [17]:
#Degree = #of features +1
poly = PolynomialFeatures(degree = 4)
X_train_df = df_new[['Temperature AVG', 'Wind Speed Daily AVG', 'Solare Radiation AVG',]]
y_train_df = df_new['Success Percentage']

#Train test split to fit the model to our dataset and obtain standards for testing
X_train, X_test, y_train, y_test = X_train_df[:150], X_train_df[150:], y_train_df[:150], y_train_df[150:]

In [18]:
#Fit our data to polynomial regression. This adds values representing interactions between input features
poly.fit(X_train)
X_train_poly = poly.transform(X_train)


In [19]:
#Import linear regression model
model = LinearRegression()

In [20]:
#Train model based on our fitted polynomial features
model.fit(X_train_poly, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [21]:
#Fit test set to polynomial
X_test_poly = poly.transform(X_test)

In [22]:
#Predict y values based on test set
y_pred = model.predict(X_test_poly)


In [23]:
#Obtain model score using 
model.score(X_test_poly, y_test)

-10.466696797181523

In [25]:
from sklearn import metrics
error = metrics.mean_squared_error(y_test, y_pred)
print(error)

1.6977898015553552


In [None]:
#Work tbd:
"""
Work to be completed:
    1. Interpolate weather data to expand training set
    2. Utilize other models for regression such as SGD
    3. Reformulate targets as "success likely" or "success unlikely" for implementation of classifier methods
    4. Implement Pipeline for efficiency in implementing new models
    5. Evaluate new models based on F1 score, Confusion Matrix, and cross validation

"""