In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display


# Input

## Read inputs

In [None]:
# Load the raw train and test tables from the inputs directory.
df_train = pd.read_csv('inputs/train.csv')
df_test = pd.read_csv('inputs/test.csv')

# Show the (rows, columns) shape of each table.
display(df_train.shape, df_test.shape)


## Preprocessing

### Concatenate train and test data

In [None]:
# Give the test frame a placeholder 'Weather' column of zeros so it can be
# processed together with the training frame.
df_test['Weather'] = np.zeros(len(df_test))

# Number of training rows: in the combined frame, rows before this position
# come from 'train.csv' and rows from this position on come from 'test.csv'.
train_end_idx = df_train.shape[0]

# Stack the training rows on top of the test rows in a single dataframe.
df: pd.DataFrame = pd.concat((df_train, df_test), sort=False)


### Before processing

In [None]:
# Combined frame before any column processing:
# shape, first rows, and summary statistics.
display(df.shape, df.head(), df.describe())

### Process columns

In [None]:
# Turn every column of the combined train+test frame into a scaled numeric
# feature: split the date, encode locations and wind directions, impute
# missing values, then min-max scale everything except the 'Weather' label.
#
# NOTE(review): the medians and the min/max used below are computed over the
# training and test rows together, because df holds both at this point.

# show object type columns
obj_columns = [col for col in df.columns if df[col].dtype == object]
print('object type columns:', obj_columns)

# transform date to year, month, day (parse the column once and reuse it)
dates = pd.to_datetime(df['Date'])
df['Year'] = dates.dt.year
df['Month'] = dates.dt.month
df['Day'] = dates.dt.day
df = df.drop(columns=['Date'])

# encode locations as the dense rank of their value, in descending order
df['Loc'] = df['Loc'].rank(method='dense', ascending=False)

# encode the 16 compass points as multiples of 22.5 (N -> 0.0, NNE -> 22.5, ...)
directions = ['N', 'NNE', 'NE', 'ENE', 'E', 'ESE', 'SE', 'SSE', 'S', 'SSW', 'SW', 'WSW', 'W', 'WNW', 'NW', 'NNW']
dir_map = {name: idx * 22.5 for idx, name in enumerate(directions)}
# values that are not keys of dir_map become NaN here and are imputed below
for wind_col in ('WindDir', 'DayWindDir', 'NightWindDir'):
    df[wind_col] = df[wind_col].map(dir_map)

# fill missing values with the per-column median
df = df.fillna(df.median(axis=0))

# normalize every column except the 'Weather' label
cols_to_norm = df.columns.drop('Weather')
# min max scale; a constant column has max == min, so its range is swapped
# for 1 and the column becomes all zeros instead of NaN from 0 / 0
col_min = df[cols_to_norm].min(axis=0)
col_range = df[cols_to_norm].max(axis=0) - col_min
df[cols_to_norm] = (df[cols_to_norm] - col_min) / col_range.replace(0, 1)


In [None]:
# Combined frame after column processing:
# shape, first rows, and summary statistics.
display(df.shape, df.head(), df.describe())

### Split into train, validation, and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Build the feature matrix and label vector once for all rows (train + test).
features = df.drop(columns=['Weather']).values
labels = df['Weather'].values

# Rows before train_end_idx are the labeled training rows; half of them are
# held out for validation. random_state is fixed, like the other seeded steps
# in this notebook, so a fresh Restart & Run All reproduces the same split.
X_train, X_val, y_train, y_val = train_test_split(
    features[:train_end_idx, :],
    labels[:train_end_idx], test_size=0.5, random_state=0)

# The remaining rows are the test rows to predict on.
X_test = features[train_end_idx:, :]


In [None]:
# Sum of the training labels (the positive count if labels are 0/1 — TODO confirm).
display(y_train.sum())

# Shapes of the train/validation features and labels, then the test features.
display(X_train.shape, X_val.shape, y_train.shape, y_val.shape, X_test.shape)


### Resampling

In [None]:
from imblearn.over_sampling import (
    ADASYN,
    BorderlineSMOTE,
    RandomOverSampler,
    SMOTE,
    SMOTENC,
)
from imblearn.combine import SMOTEENN, SMOTETomek

# Resample the training split with a seeded SMOTEENN resampler.
# Alternative kept for reference:
#   resampler = SMOTENC(categorical_features=[0, 1])
resampler = SMOTEENN(random_state=0)

# X_train / y_train are overwritten with the resampled data, so running this
# cell again resamples the already resampled arrays.
X_train, y_train = resampler.fit_resample(X_train, y_train)

In [None]:
# Label sum of the training split after resampling.
display(y_train.sum())

# Shapes of the resampled training features and labels.
display(X_train.shape, y_train.shape)


# Training

In [None]:
from sklearn.linear_model import LinearRegression


class ModelClass:
    """Thin wrapper that forwards ``fit`` and ``predict`` to ``self.model``.

    The base class stores a ``LinearRegression`` instance as ``self.model``.
    """

    def __init__(self) -> None:
        # Estimator that fit() and predict() delegate to.
        self.model = LinearRegression()

    def fit(self, X, y) -> None:
        """Fit the wrapped estimator on ``X`` and ``y``.

        Whatever the estimator's ``fit`` returns is discarded.
        """
        self.model.fit(X, y)

    def predict(self, X):
        """Return the wrapped estimator's predictions for ``X``."""
        predictions = self.model.predict(X)
        return predictions


In [None]:
from sklearn.tree import DecisionTreeClassifier


class DecisionTree(ModelClass):
    """ModelClass variant whose estimator is an entropy-criterion decision tree classifier."""

    def __init__(self) -> None:
        # ModelClass.__init__ is not called; self.model is assigned directly here.
        self.model = DecisionTreeClassifier(
            random_state=0,
            criterion='entropy',
        )


In [None]:
from sklearn.svm import SVC

class SVM(ModelClass):
    """ModelClass variant whose estimator is an RBF-kernel support vector classifier."""

    def __init__(self) -> None:
        # ModelClass.__init__ is not called; self.model is assigned directly here.
        # Linear-kernel alternative, kept for reference:
        #   SVC(kernel='linear', random_state=0, probability=True)
        self.model = SVC(
            kernel='rbf',
            C=2,
            random_state=0,
            probability=True,
        )

In [None]:
# Train the SVM wrapper on the current training data (X_train, y_train).
model = SVM()
model.fit(X_train, y_train)

# Predict labels for the validation split; the scoring cell compares these
# predictions against y_val.
y_pred_decision = model.predict(X_val)


## Get score

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Accuracy of the validation predictions, printed with '%f' formatting.
val_accuracy = accuracy_score(y_val, y_pred_decision)
print('Accuracy: %f' % val_accuracy)

# F1 score of the same predictions (f1_score called with its default arguments).
val_f1 = f1_score(y_val, y_pred_decision)
print('f1-score: %f' % val_f1)


# Output

In [None]:
import os

# Predict labels for the test rows.
ans_pred = model.predict(X_test)

# Single integer 'Weather' column; the frame's default index is written to
# the CSV under the header 'Id'.
df_sap = pd.DataFrame(ans_pred.astype(int), columns=['Weather'])

# to_csv does not create missing directories, so make sure 'outputs/' exists
# before writing (no-op when it is already there).
os.makedirs('outputs', exist_ok=True)
df_sap.to_csv('outputs/prediction.csv', index_label='Id')
