In [1]:
"""
imports to analyse the data
"""
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import Imputer
import numpy as np
import seaborn as sns

In [None]:
"""
visualizing the Summary_of_Weather.csv' data
"""
data_weather = pd.read_csv('Summary_of_Weather.csv')
print(data_weather.head())
print(data_weather.tail())
data_weather.info()

In [None]:
"""
visualizing the Weather_Station_locations.csv' data
"""
data_stations = pd.read_csv('Weather_Station_Locations.csv')
print(data_stations.head())
print(data_stations.tail())
data_stations.info()

In [None]:
"""
Merging the data by stations
"""
data = data_weather.merge(data_stations,left_on='STA', right_on='WBAN')
print(data.head())

In [None]:
"""
Ploting histogram of the data
"""
data.hist(bins=50, figsize=(8,6))
plt.tight_layout()
plt.show()


In [None]:
"""
ploting stations to visualize their locations
"""
data.plot(kind='scatter', x = 'Longitude', y = 'Latitude', alpha = 0.1)

In [None]:
"""
Check data correlations
"""
sns.heatmap(data.corr(), annot = True, fmt = '.2f')
corr_matrix = data.corr()
print(corr_matrix['MaxTemp'].sort_values(ascending=False))

In [None]:
"""
Eliminating low correlated data and columns without significance
"""
data = data.drop(['Date','PoorWeather', 'FT', 'FB','FTI', 'ITH', 'SD3', 'RHX','RHN','RVG', 'WTE', 'SND', 
                  'TSHDSBRSGF','STA', 'YR', 'MO', 'PGT', 'DR', 'DA', 'SPD', 'WindGustSpd', 'Precip',  'MinTemp'
                  ,'MeanTemp' , 'MaxTemp','Snowfall', 'SNF', 'PRCP', 'WBAN', 'ELEV', 'Longitude'],axis=1)

In [None]:
"""
(ALTERNATIVE TO IMPUTER)
Function to clean columns with zeros and NaN 
"""
def clean_col_NaN(data_col):
    """Return ``data_col`` converted to numeric with its gaps filled.

    Values that cannot be parsed as numbers are turned into NaN, and every
    NaN is then replaced by the mean of the column's *distinct* numeric
    values (each distinct value counts once, regardless of how often it
    occurs in the column).

    Parameters
    ----------
    data_col : pandas.Series
        Column to clean; it is not modified in place.

    Returns
    -------
    pandas.Series
        Numeric column with NaN replaced by the fill value. If the column
        holds no numeric value at all, the result is left as all-NaN.
    """
    distinct_values = data_col.dropna().unique()
    # errors='coerce' maps unparseable entries to NaN instead of raising.
    distinct_numeric = pd.to_numeric(distinct_values, errors='coerce')
    numeric_col = pd.to_numeric(data_col, errors='coerce')
    # Nothing to average (empty or fully non-numeric column): skip the fill
    # rather than letting np.nanmean warn about an empty slice.
    if pd.isna(distinct_numeric).all():
        return numeric_col
    fill_value = np.nanmean(distinct_numeric)
    return numeric_col.fillna(fill_value)

# Clean the MIN, MAX, MEA and Latitude columns, in that order.
for column_name in ('MIN', 'MAX', 'MEA', 'Latitude'):
    data[column_name] = clean_col_NaN(data[column_name])




In [None]:
"""
Preparing data to ML algorithms
"""

from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(data,
                                        test_size = 0.2,
                                        random_state=35)
print("data has {} atributes\n {} train instances\n {} test instances".format
      (len(data), len(train_set), len(test_set)))

train_X = train_set.drop(['MAX'], axis=1)
train_y = train_set.MaxTemp.copy()


In [None]:
"""
Choosing and running the model
"""
from sklearn.pipeline import FeatureUnion

from future_encoders import OneHotEncoder

num_atribs = list(train_X.columns)

from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(train_X, train_y)


some_data = train_x.iloc[:5]
some_labels = train_y.iloc[:5]

print("Predictions: ", lin_reg.predict(some_data))

from sklearn.metrics import mean_squared_error

predictions = lin_reg.predict(some_data)
lin_mse = mean_squared_error(some_labels, predictions)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)