In [1]:
# Import necessary packages and modules
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Gather data
df = pd.read_csv('data_participant_ECCC.csv')
df.head()

  df = pd.read_csv('data_participant_ECCC.csv')


Unnamed: 0,ID,year,month,day,hour,start_time,latitude,longitude,event,r_300.t-6,...,LD.t-2,LD.t-1,LD.t+0,LD.t+1,LD.t+2,LD.t+3,hail_size,y_thunderstorm,y_hail,y_severe
0,0,2005,4,8,4,4/8/2005 4:00,56.129,-119.08198,,98.769104,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,False,False
1,1,2005,4,8,8,4/8/2005 8:00,52.129,-119.08198,,100.73672,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,False,False
2,2,2005,4,13,2,4/13/2005 2:00,48.379,-114.33179,,33.332287,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,False,False
3,3,2005,4,21,2,4/21/2005 2:00,53.879,-105.831436,,83.93927,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,False,False
4,4,2005,5,3,21,5/3/2005 21:00,52.879,-115.83185,,95.18686,...,4.8e-05,0.000451,0.001993,0.004847,0.000247,0.001017,0.0,True,False,False


In [3]:
# Split data into two (one with missing y values and one without)
missing_y = df['y_thunderstorm'].isna() & df['y_hail'].isna() & df['y_severe'].isna()
df_predict = df[missing_y]
df_remaining = df[~missing_y]

In [4]:
# Split df_remaining into data frames that are/are not missing hail_size values
missing_hail_size = df_remaining['hail_size'].isna()
df_wo_hail_size = df_remaining[missing_hail_size]
df_cleaned = df_remaining[~missing_hail_size]

In [5]:
# Isolate outputs with values
y1 = df_cleaned['y_thunderstorm']
y2 = df_cleaned['y_hail']
y3 = df_cleaned['y_severe']

print(f'Duplicates in cleaned data: {df_cleaned.duplicated().sum()}')
print(y1.value_counts())
print(y2.value_counts())
print(y3.value_counts())

Duplicates in cleaned data: 0
y_thunderstorm
True     5783
False    3249
Name: count, dtype: int64
y_hail
True     5516
False    3516
Name: count, dtype: int64
y_severe
False    5506
True     3526
Name: count, dtype: int64


In [6]:
# Train models for y_thunderstorm, y_hail, y_severe
model1 = DecisionTreeClassifier()
X_train, X_test, y1_train, y1_test = train_test_split(df_cleaned.drop(columns=['ID', 'year', 'month', 'day', 'hour', 'start_time', 'latitude', 'longitude', 'event', 'y_thunderstorm', 'y_hail', 'y_severe']), y1.astype(int), test_size=0.2, random_state=42)
model1.fit(X_train, y1_train)

predictions1 = model1.predict(X_test)
score1 = accuracy_score(y1_test, predictions1)

model2 = DecisionTreeClassifier()
X_train, X_test, y2_train, y2_test = train_test_split(df_cleaned.drop(columns=['ID', 'year', 'month', 'day', 'hour', 'start_time', 'latitude', 'longitude', 'event', 'y_thunderstorm', 'y_hail', 'y_severe']), y2.astype(int), test_size=0.2)
model2.fit(X_train, y2_train)

predictions2 = model2.predict(X_test)
score2 = accuracy_score(y2_test, predictions2)

model3 = DecisionTreeClassifier()
X_train, X_test, y3_train, y3_test = train_test_split(df_cleaned.drop(columns=['ID', 'year', 'month', 'day', 'hour', 'start_time', 'latitude', 'longitude', 'event', 'y_thunderstorm', 'y_hail', 'y_severe']), y3.astype(int), test_size=0.2)
model3.fit(X_train, y3_train)

predictions3 = model3.predict(X_test)
score3 = accuracy_score(y3_test, predictions3)

print(f"Accuracy for y_thunderstorm: {score1*100:.2f}%")
print(f"Accuracy for y_hail: {score2*100:.2f}%")
print(f"Accuracy for y_severe: {score3*100:.2f}%")
print(f"Overall accuracy: {(score1 + score2 + score3)/3*100:.2f}%")

Accuracy for y_thunderstorm: 95.07%
Accuracy for y_hail: 94.80%
Accuracy for y_severe: 95.02%
Overall accuracy: 94.96%


In [7]:
# Predict missing values of y_thunderstorm, y_hail, y_severe
y1_predict = model1.predict(df_predict.drop(columns=['ID', 'year', 'month', 'day', 'hour', 'start_time', 'latitude', 'longitude', 'event', 'y_thunderstorm', 'y_hail', 'y_severe'])).astype(bool)
y2_predict = model2.predict(df_predict.drop(columns=['ID', 'year', 'month', 'day', 'hour', 'start_time', 'latitude', 'longitude', 'event', 'y_thunderstorm', 'y_hail', 'y_severe'])).astype(bool)
y3_predict = model3.predict(df_predict.drop(columns=['ID', 'year', 'month', 'day', 'hour', 'start_time', 'latitude', 'longitude', 'event', 'y_thunderstorm', 'y_hail', 'y_severe'])).astype(bool)

In [8]:
# Replace missing y-values with predictions
df_predict_new = df_predict.drop(columns=['y_thunderstorm', 'y_hail', 'y_severe'])
df_predict_new['y_thunderstorm'] = y1_predict
df_predict_new['y_hail'] = y2_predict
df_predict_new['y_severe'] = y3_predict

In [9]:
# Combine all sub data frames into complete final data frame
df_final = pd.concat([df_predict_new, df_wo_hail_size, df_cleaned], ignore_index=True)

In [10]:
# Save .csv file, to be submitted for evaluation
df_final.to_csv('output_ECCC.csv', index=False)