In [2]:
import pandas as pd
import numpy as np
import pylab 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.stats as stat
import imblearn
import warnings
warnings.filterwarnings('ignore')
import plotly.express as px
import matplotlib
#import shap
from sklearn.preprocessing import StandardScaler, OrdinalEncoder,LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_predict,cross_validate, cross_val_score,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
from sklearn import metrics
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_auc_score,roc_curve
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import FunctionTransformer
from datetime import datetime, timedelta
from scipy.stats import zscore
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE

In [3]:
# Directory holding the competition CSV files; change this one value to relocate the data.
DATA_DIR = "D:/portfolio project 25-1/widsdatathon2023/Data"

train_data = pd.read_csv(f"{DATA_DIR}/train_data.csv")
test_data = pd.read_csv(f"{DATA_DIR}/test_data.csv")

# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appends a stray "None" line to the output.
train_data.info()
print(train_data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375734 entries, 0 to 375733
Columns: 246 entries, index to wind-vwnd-925-2010-20
dtypes: float64(240), int64(4), object(2)
memory usage: 705.2+ MB
None
               index            lat            lon  \
count  375734.000000  375734.000000  375734.000000   
mean   187866.500000       0.592766       0.517964   
std    108465.207359       0.251744       0.272059   
min         0.000000       0.000000       0.000000   
25%     93933.250000       0.409091       0.300000   
50%    187866.500000       0.590909       0.533333   
75%    281799.750000       0.818182       0.766667   
max    375733.000000       1.000000       1.000000   

       contest-pevpr-sfc-gauss-14d__pevpr  nmme0-tmp2m-34w__cancm30  \
count                       375734.000000             375734.000000   
mean                           276.744064                 10.801682   
std                            198.085182                 10.952417   
min                        

In [4]:
# The label is taken to be the column that exists in the training frame but
# not in the test frame. Fail loudly if that assumption does not hold instead
# of silently keeping the last differing column (or leaving `target` undefined).
_train_only_columns = [col for col in train_data.columns if col not in test_data.columns]
if len(_train_only_columns) != 1:
    raise ValueError(
        f"Expected exactly one column present only in train_data, found: {_train_only_columns}"
    )
target = _train_only_columns[0]
target

'contest-tmp2m-14d__tmp2m'

In [5]:
# Convert the startdate column of both frames with pd.to_datetime so the
# .dt accessors used during feature engineering are available.
train_data['startdate'] = pd.to_datetime(train_data['startdate'])
test_data['startdate'] = pd.to_datetime(test_data['startdate'])

In [6]:
def rmse(actual, predicted):
    """Return the root-mean-squared error between `actual` and `predicted`.

    Parameters
    ----------
    actual : array-like
        Ground-truth values.
    predicted : array-like
        Predicted values, same shape as `actual`.

    Returns
    -------
    float
        The RMSE. For multi-output inputs this is the uniform average of the
        per-output RMSE values.
    """
    # `mean_squared_error(..., squared=False)` is deprecated in scikit-learn 1.4
    # and removed in 1.6. Taking the square root of the per-output MSE and then
    # averaging reproduces the same number without that keyword.
    per_output_mse = mean_squared_error(actual, predicted, multioutput='raw_values')
    return np.average(np.sqrt(per_output_mse))

def location_nom(train, test):
    """Assign a shared location id (`loc_group`) to the rows of both frames.

    `lat` and `lon` are rounded to 14 decimals in place on both inputs, the
    frames are stacked, and every distinct (lat, lon) pair receives one
    integer group number.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with `lat` and `lon` columns. Both are modified in place by
        the rounding step.

    Returns
    -------
    tuple of pandas.DataFrame
        The train rows and the test rows of the stacked frame, each with the
        new `loc_group` column. The module-level `target` column is dropped
        from the test part.
    """
    # Ref: https://www.kaggle.com/code/flaviafelicioni/wids-2023-different-locations-train-test-solved
    decimals = 14

    for frame in (train, test):
        for coord in ('lat', 'lon'):
            frame.loc[:, coord] = frame[coord].round(decimals)

    n_train = len(train)
    combined = pd.concat([train, test], axis=0)
    combined['loc_group'] = combined.groupby(['lat', 'lon']).ngroup()

    # Split the stacked frame back by position: train rows come first.
    train_part = combined.iloc[:n_train]
    test_part = combined.iloc[n_train:].drop(target, axis=1)

    return train_part, test_part

def categorical_encode(train, test):
    """Label-encode `climateregions__climateregion` in both frames.

    The encoder is fitted on the train column only and then applied to train
    and test, so both share one mapping. The column is overwritten in place
    on both inputs, which are also returned.
    """
    region_col = 'climateregions__climateregion'
    encoder = LabelEncoder().fit(train[region_col])
    train[region_col] = encoder.transform(train[region_col])
    test[region_col] = encoder.transform(test[region_col])
    return train, test
    
def fill_na(df):
    """Return `df` sorted by (`loc_group`, `startdate`) with gaps forward-filled.

    Every missing value takes the nearest earlier non-missing value of its
    column in the sorted order. The fill runs over the whole sorted frame,
    not per group, so a gap at the start of one `loc_group` is filled from
    the last rows of the preceding group. The input frame is not modified.
    """
    ordered = df.sort_values(by=['loc_group', 'startdate'])
    return ordered.ffill()

def creat_new_featute(df):
    """Add calendar features derived from the datetime column `startdate`.

    Adds `month`, `day_of_year`, `quarter` and `week` (ISO week number) to
    `df` in place and returns the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime64 `startdate` column.

    Returns
    -------
    pandas.DataFrame
        `df` itself, with the four new integer columns.
    """
    #df['year'] = df['startdate'].dt.year
    df['month'] = df['startdate'].dt.month
    df['day_of_year'] = df['startdate'].dt.dayofyear
    df['quarter'] = df['startdate'].dt.quarter
    # `Series.dt.weekofyear` was removed in pandas 2.0. `isocalendar().week`
    # yields the same ISO week number but as UInt32, so cast back to int64.
    df['week'] = df['startdate'].dt.isocalendar().week.astype('int64')
    return df

def sin_transformer(period):
    """Build a FunctionTransformer that maps x to sin(x / period * 2 * pi)."""
    def _sine(x):
        return np.sin(x / period * 2 * np.pi)

    return FunctionTransformer(_sine)


def cos_transformer(period):
    """Build a FunctionTransformer that maps x to cos(x / period * 2 * pi)."""
    def _cosine(x):
        return np.cos(x / period * 2 * np.pi)

    return FunctionTransformer(_cosine)

def encode_cyclical(df):
    """Add sine/cosine encodings of the calendar columns to `df`.

    For each of `day_of_year`, `week`, `month` and `quarter`, a `<name>_sin`
    and a `<name>_cos` column is written using the period listed below.
    The frame is modified in place and returned.
    """
    # (source column, period used for the encoding)
    cycles = (
        ('day_of_year', 365),
        ('week', 52),
        ('month', 12),
        ('quarter', 4),
    )
    for column, period in cycles:
        df[f'{column}_sin'] = sin_transformer(period).fit_transform(df[column])
        df[f'{column}_cos'] = cos_transformer(period).fit_transform(df[column])
    return df

def identify_correlated(df, threshold):
    """List columns that are highly correlated with a later column of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Only numeric and boolean columns take part; all other
        columns are ignored.
    threshold : float
        A column is reported when its absolute correlation with at least
        one column that comes after it exceeds this value.

    Returns
    -------
    list
        Column labels to drop, in column order.
    """
    # pandas >= 2.0 no longer silently skips non-numeric columns in corr()
    # and raises instead, so restrict to numeric data explicitly.
    numeric = df.select_dtypes(include=['number', 'bool'])
    corr_matrix = numeric.corr().abs()

    # Hide the diagonal and upper triangle so each pair is inspected once:
    # what remains in column c are its correlations with later columns.
    upper_mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    lower_only = corr_matrix.mask(upper_mask)

    return [c for c in lower_only.columns if (lower_only[c] > threshold).any()]

def feature_engineering(train_raw, test_raw):
    """Run the full feature pipeline and return model-ready matrices.

    Steps: add `loc_group`, forward-fill the train frame, add calendar
    columns, label-encode the climate region, add sine/cosine encodings,
    then drop highly correlated columns and the raw helper columns.

    Parameters
    ----------
    train_raw, test_raw : pandas.DataFrame
        Raw frames. They are passed to helpers that assign columns on them,
        so callers should pass copies if the originals must stay intact.

    Returns
    -------
    X : pandas.DataFrame
        Train features.
    y : pandas.Series
        Train values of the module-level `target` column.
    X_test : pandas.DataFrame
        Test features, same columns as `X`.
    """
    train, test = location_nom(train_raw, test_raw)
    # Only the train frame is forward-filled (fill_na also re-sorts its rows);
    # the test frame keeps its original row order and any missing values.
    train = fill_na(train)
    train = creat_new_featute(train)
    test = creat_new_featute(test)
    train, test = categorical_encode(train, test)
    train = encode_cyclical(train)
    test = encode_cyclical(test)

    # NOTE(review): `train` still contains the target column here, so a
    # predictor correlated above the threshold with the label may end up in
    # drop_cols — confirm that this is intended.
    drop_cols = identify_correlated(train, 0.96)
    # The raw calendar columns are dropped in favour of their sin/cos
    # encodings. 'quarter' was previously misspelled 'quater', which left the
    # raw quarter column in the feature set.
    drop_cols = drop_cols + ['index', 'startdate', 'lat', 'lon', target,
                             'month', 'week', 'quarter', 'day_of_year']
    features = [col for col in train.columns if col not in drop_cols]
    X = train[features]
    X_test = test[features]
    y = train[target]

    return X, y, X_test

In [7]:
# Hand copies to the pipeline: its helpers assign columns on the frames they
# receive, and the loaded train_data / test_data should stay as read.
X, y, X_test = feature_engineering(
    train_raw=train_data.copy(),
    test_raw=test_data.copy(),
)
# Optional hold-out split (disabled):
#X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=42)
#print(f'Train_shape: {X_train.shape}    |   Val_shape: {X_val.shape}    |   Test_shape: {X_test.shape}')

In [8]:
# Read the sample solution file and take its target column as y_test.
# NOTE(review): presumably these are placeholder values rather than true labels — confirm before scoring against them.
sample_sollution = pd.read_csv(
    filepath_or_buffer='D:/portfolio project 25-1/widsdatathon2023/Data/sample_solution.csv'
)
y_test = sample_sollution[target]