In [None]:
%matplotlib inline
import matplotlib.pyplot as pyplot
import numpy
import pandas
import seaborn
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [None]:
# Read both CSV inputs into DataFrames. The locations are absolute paths on the
# author's machine, so they must be adjusted before running elsewhere.
mission_csv_path = '/Users/joshfrazier/data/operations.csv'
weather_csv_path = '/Users/joshfrazier/data/weather_data/weather_summary.csv'

raw_mission_dataset = pandas.read_csv(mission_csv_path)
raw_weather_dataset = pandas.read_csv(weather_csv_path)

In [None]:
# Show the describe() summary of the weather table as this cell's output.
weather_summary_stats = raw_weather_dataset.describe()
weather_summary_stats

In [None]:
# Plot MaxTemp against MinTemp as point markers to eyeball their relationship.
# DataFrame.plot returns the Axes it drew on, so the title and axis labels are
# set on that object directly.
scatter_axes = raw_weather_dataset.plot(x='MinTemp', y='MaxTemp', style='o')
scatter_axes.set_title('MinTemp vs MaxTemp')
scatter_axes.set_xlabel('MinTemp')
scatter_axes.set_ylabel('MaxTemp')
pyplot.show()

In [None]:
# Inspect the distribution of the MaxTemp column (density histogram with a KDE
# overlay); the centre of mass of this plot shows where typical values lie.
fig, ax = pyplot.subplots(figsize=(15, 20))
# seaborn.distplot is deprecated; histplot with kde=True and stat='density'
# (plus cut=3 for the KDE) is the documented replacement for its default view.
seaborn.histplot(
    raw_weather_dataset['MaxTemp'],
    kde=True,
    stat='density',
    kde_kws={'cut': 3},
    ax=ax,
)
# tight_layout only has something to adjust once the axes have been drawn,
# so it is applied after plotting rather than on the empty figure.
fig.tight_layout()
pyplot.show()

In [None]:
# Separate the predictor ("attribute") from the target ("label").
# MinTemp is the independent variable; MaxTemp is the dependent variable the
# model will learn to predict from it.
# Each 1-D column is given a trailing axis so both arrays have shape (n_rows, 1).
feature_column = raw_weather_dataset['MinTemp']
target_column = raw_weather_dataset['MaxTemp']
X = feature_column.values[:, numpy.newaxis]
y = target_column.values[:, numpy.newaxis]

In [None]:
# Hold out 20% of the rows as the test set and train on the remaining 80%.
# random_state is pinned so the split is the same on every run.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Create the linear regression model and train it on the training split.
# fit() returns the estimator itself, so construction and training are chained.
regressor = LinearRegression().fit(X_train, y_train)
# Echo the fitted estimator as the cell output.
regressor

In [None]:
# The linear regression model finds the best value for the intercept and slope, which results in a line that best fits the data.
# The slope is an absolute rate, not a percentage: for every one-unit increase in MinTemp, the predicted MaxTemp changes by about 0.92 units.
# Print the fitted intercept first, then the fitted coefficient array.
for fitted_parameter in (regressor.intercept_, regressor.coef_):
    print(fitted_parameter)

In [None]:
# Run the trained model on the held-out test inputs.
y_prediction = regressor.predict(X_test)

# Put the true and predicted values side by side, one row per test sample.
# Both arrays are flattened to 1-D so each becomes a single column.
comparison_columns = {
    'Actual': y_test.flatten(),
    'Predicted': y_prediction.flatten(),
}
y_dataframe = pandas.DataFrame(comparison_columns)
y_dataframe

In [None]:
# Bar chart comparing actual and predicted values for the first 25 test rows.
df1 = y_dataframe.head(25)
bar_axes = df1.plot.bar(figsize=(16, 10))
# Style the major and minor grid lines separately.
# NOTE(review): minor grid lines are only drawn where minor ticks exist; no
# minor ticks are enabled here, so the second call may have no visible effect
# - confirm whether minorticks_on() was intended.
bar_axes.grid(which='major', linestyle='-', linewidth='0.5', color='green')
bar_axes.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
pyplot.show()

In [None]:
# Overlay the model's predictions (red line) on the observed test points
# (gray markers) to see how well the fitted line tracks the data.
fit_axes = pyplot.gca()
fit_axes.scatter(X_test, y_test, color='gray')
fit_axes.plot(X_test, y_prediction, color='red', linewidth=2)
pyplot.show()