<a href="https://colab.research.google.com/github/dsmondo/jh-faces/blob/main/semiproject_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ANOVA

In [None]:
# Test for differences in the mean of the variable (analysis of variance)
# One-way ANOVA (Analysis of Variance)
from statsmodels.stats.multicomp import pairwise_tukeyhsd, MultiComparison
import pingouin as pg

# Compare a numeric column (presumably one of num_cols) across the Charger_type groups
def levene_anova(col):
    """Test whether the mean of ``col`` differs between the Charger_type groups.

    The module-level DataFrame ``df`` is split into the four groups
    ``Charger_type`` in (0, 1, 2, 3). Levene's test decides which ANOVA is
    used:

    * Levene p-value < 0.05: Welch's ANOVA (``pingouin.welch_anova``).
    * otherwise: classic one-way ANOVA (``scipy.stats.f_oneway``).

    When the chosen ANOVA is significant at the 0.05 level, a Tukey HSD
    post-hoc comparison is plotted and its summary is printed.

    Args:
        col: Name of the column in ``df`` to compare between the groups.

    Returns:
        None. All results are printed (and plotted for the post-hoc test).
    """
    # Imported here so the function does not rely on another notebook cell
    # having bound the name ``stats``.
    from scipy import stats

    group_codes = (0, 1, 2, 3)
    samples = [df.loc[df['Charger_type'] == code, col] for code in group_codes]
    n_groups = len(samples)

    # Levene's test: H0 = all groups share the same variance.
    levene = stats.levene(*samples)

    if levene[1] < 0.05:
        print("MESSAGE: At least one of the variances among the groups is different.")
        # Welch's ANOVA since H0 (equal variances) is rejected
        welch = pg.welch_anova(dv=col, between='Charger_type', data=df)
        p_value = welch['p-unc'].item()
    else:
        print("MESSAGE: All groups have equal variances.")
        # Classic ANOVA since H0 (equal variances) is not rejected
        p_value = stats.f_oneway(*samples)[1]

    if p_value < 0.05:
        print(f'MESSAGE: Reject the null hypothesis that the {col} are equal between the {n_groups} groups')

        # Post-hoc test.
        # NOTE(review): Tukey HSD assumes equal variances; after Welch's ANOVA
        # a Games-Howell test (pg.pairwise_gameshowell) is usually preferred.
        # Kept as Tukey HSD so the plot and summary output stay the same.
        mc = MultiComparison(data=df[col], groups=df['Charger_type'])
        tukeyhsd = mc.tukeyhsd(alpha=0.05)
        tukeyhsd.plot_simultaneous()

        print(tukeyhsd.summary())
    else:
        print(f'MESSAGE: Accept the null hypothesis that the {col} are equal between the {n_groups} groups')

# Correlation Heatmap

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# `fm` and `rc` are used below; import them here so this cell does not
# fail with NameError when they were not bound by an earlier cell.
from matplotlib import font_manager as fm
from matplotlib import rc

# Make the font stored at font_path the default matplotlib font family.
font_path = "/MultiCampus/Malgun.ttf"
font_name = fm.FontProperties(fname=font_path).get_name()
rc('font', family=font_name)

# Subset of final_df whose pairwise correlations are plotted.
# (Despite its name, `corr` holds the raw columns; the correlation matrix
# itself is computed by corr.corr() in the heatmap call.)
corr = final_df[['Gas_station_count', 'Charger_ACC', 'Performance_facility_ACC', 'Land_Price', 'Library_ACC',
                'Hospital_ACC', 'Healthcare_Facility_ACC', 'Community_Park_ACC', 'Fire_Station_ACC', 'Population',
                'Theme_Park_ACC', 'Parking_lot_ACC', 'Sports_Facility_ACC', 'Elementary_School_ACC', 'Farmland',
                'Stream', 'Altitude', 'Charger_type']]

plt.figure(figsize=(15, 8))
# Fix the colour scale to [-1, 1] so colours are comparable across runs.
sns.heatmap(corr.corr(), annot=True, cmap='viridis', vmax=1, vmin=-1)

plt.show()

# Generation of Train dataset

In [None]:
import pandas as pd
# Build one row per rand_point_id from 'intersection_ratio.csv':
# area-weighted sums of the facility/population features plus two 0/1 flags
# telling whether any fragment of the point has farmland / stream set to 1.

# Read the CSV file
df = pd.read_csv('intersection_ratio.csv', encoding='cp949')

# Drop unnecessary columns
df = df.drop(columns=['fid', 'emd_cd', 'emd_nm_k', 'no_charging', 'centerpoint', 'centerpoint_2', 'area'])

# Columns to multiply
columns_to_multiply = ['Charger_ACC', 'Gas_station_count', 'Performance_facility_ACC', 'Land_Price', 'Library_ACC',
                       'Hospital_ACC', 'Healthcare_Facility_ACC', 'Community_Park_ACC', 'Fire_Station_ACC', 'Theme_Park_ACC',
                       'Parking_lot_ACC', 'Sports_Facility_ACC', 'Elementary_School_ACC', 'Population']

# Weight every feature by the area ratio of its row
df[columns_to_multiply] = df[columns_to_multiply].mul(df['area_ratio'], axis=0)

# Key column followed by the weighted feature columns
columns_to_multiply2 = ['rand_point_id', *columns_to_multiply]
df1 = df[columns_to_multiply2]

# Group by 'rand_point_id' and sum the weighted values
df1_sum = df1.groupby('rand_point_id').sum()

# Round 'Population' and 'Gas_station_count' to integers AFTER aggregating.
# Rounding each weighted fragment first and summing afterwards accumulates
# rounding error (e.g. two fragments of 0.5 each would both round to 0).
for column in ('Population', 'Gas_station_count'):
    df1_sum[column] = df1_sum[column].round().astype(int)


def _contains_one(values):
    """Return 1 if any entry of the group equals 1, else 0."""
    return int(1 in values.values)


# Group by 'rand_point_id' and check if 'farmland' is included
df2 = df.groupby('rand_point_id')['farmland'].apply(_contains_one).reset_index(name='farmland_YN')

# Group by 'rand_point_id' and check if 'stream' is included
df3 = df.groupby('rand_point_id')['stream'].apply(_contains_one).reset_index(name='stream_YN')

# Merge df1_sum and df2 on 'rand_point_id'
merged_df = pd.merge(df1_sum, df2, on='rand_point_id', how='inner')

# Merge merged_df and df3 on 'rand_point_id'
final_merged_df = pd.merge(merged_df, df3, on='rand_point_id', how='inner')

# Display the final merged DataFrame (notebook cell output)
final_merged_df


# Machine Learning

In [None]:
import pandas as pd
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import SMOTE

# Train RandomForest, LightGBM, XGBoost and a soft-voting ensemble on the
# scaled, SMOTE-resampled training data and evaluate each on the test data.

# Load data
train = pd.read_csv("/content/drive/MyDrive/naju/train_data.csv")
test = pd.read_csv("/content/drive/MyDrive/naju/test_data.csv")

# Target and feature columns
y_train = train['Charger_type']
y_test = test['Charger_type']
X_train = train.drop(columns=['Charger_type', 'Unnamed: 0', 'rand_point_id', 'Charger_ACC', 'Gas_station_count'])
X_test = test.drop(columns=['Charger_type', 'Charger_cnt', 'Unnamed: 0', 'fid', 'gid', 'emd_cd', 'emd_nm_k', 'centerpoint', 'geometry'])

# Keep only the training features in the test set, in the same order
X_test = X_test[X_train.columns]

# Apply RobustScaler (fitted on the training data only)
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE to the training data only
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)


def _print_evaluation(label, y_true, y_pred):
    """Print accuracy, confusion matrix and classification report."""
    print(f"{label}:", accuracy_score(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred))


# Initialize models
rf_model = RandomForestClassifier(random_state=42)
lgb_model = lgb.LGBMClassifier(random_state=42)
xgb_model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')

# Train each model
rf_model.fit(X_train_smote, y_train_smote)
lgb_model.fit(X_train_smote, y_train_smote)
xgb_model.fit(X_train_smote, y_train_smote)

# Predictions for each model
y_pred_rf = rf_model.predict(X_test_scaled)
y_pred_lgb = lgb_model.predict(X_test_scaled)
y_pred_xgb = xgb_model.predict(X_test_scaled)

# Model evaluation
_print_evaluation("RandomForest Accuracy", y_test, y_pred_rf)
_print_evaluation("LightGBM Accuracy", y_test, y_pred_lgb)
_print_evaluation("XGBoost Accuracy", y_test, y_pred_xgb)

# Voting Classifier
estimators = [('rf', rf_model), ('lgb', lgb_model), ('xgb', xgb_model)]
voting_clf = VotingClassifier(estimators=estimators, voting='soft')
voting_clf.fit(X_train_smote, y_train_smote)  # Retrain using SMOTE-applied data

# Predictions and evaluation for Voting Classifier
y_pred_voting = voting_clf.predict(X_test_scaled)
_print_evaluation("Voting Classifier Accuracy (SMOTE)", y_test, y_pred_voting)

# Add predictions of each model to the test DataFrame.
# The predictions computed above are reused instead of running every model
# on the test set a second time.
test['y_pred_rf'] = y_pred_rf
test['y_pred_lgb'] = y_pred_lgb
test['y_pred_xgb'] = y_pred_xgb
test['y_pred_voting'] = y_pred_voting

# Indices where actual is '0' but model predicts '1', '2', or '3'
fp_indices_rf = test[(test['Charger_type'] == 0) & (test['y_pred_rf'].isin([1, 2, 3]))].index.tolist()
fp_indices_lgb = test[(test['Charger_type'] == 0) & (test['y_pred_lgb'].isin([1,2,3]))].index.tolist()
fp_indices_xgb = test[(test['Charger_type'] == 0) & (test['y_pred