## Model Training

### importing necessary libraries

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# modeling libraries
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

In [3]:
X = pd.read_csv('./data/feature_matrix.csv')
X.head()

Unnamed: 0,country_freq,year,city_freq,main_category_freq,funding_rounds,funding_filled,first_funding_year,last_funding_year,funding_duration_days,avg_funding_per_round
0,1736,2015,288,226,1,10000000,2015,2015,0,9999900.0
1,40920,2014,5,228,2,700000,2014,2014,227,349998.3
2,4009,2014,7927,1501,1,3406878,2014,2014,0,3406844.0
3,1714,2007,573,2181,1,2000000,2008,2008,0,1999980.0
4,40920,2010,32,4010,1,10070591,2014,2014,0,10070490.0


In [4]:
y = pd.read_csv('./data/target_variable.csv')
y.head()

Unnamed: 0,success
0,False
1,False
2,False
3,False
4,False


In [5]:
y = y.values.ravel() 

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [8]:
scaler = StandardScaler()
X_train_res = scaler.fit_transform(X_train_res)
X_test = scaler.transform(X_test)


In [9]:
log_reg = LogisticRegression(class_weight='balanced', random_state=42)
log_reg.fit(X_train_res, y_train_res)
y_pred_log_reg = log_reg.predict(X_test)
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_log_reg))
print("accuracy:", log_reg.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred_log_reg))

Logistic Regression Classification Report:
              precision    recall  f1-score   support

       False       0.95      0.78      0.86     11657
        True       0.25      0.66      0.36      1315

    accuracy                           0.76     12972
   macro avg       0.60      0.72      0.61     12972
weighted avg       0.88      0.76      0.81     12972

accuracy: 0.7642614862781375
[[9049 2608]
 [ 450  865]]


In [10]:
rf_clf = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_clf.fit(X_train_res, y_train_res)
y_pred_rf = rf_clf.predict(X_test)
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))
print("accuracy:", rf_clf.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred_rf))

Random Forest Classification Report:
              precision    recall  f1-score   support

       False       0.94      0.89      0.91     11657
        True       0.33      0.48      0.39      1315

    accuracy                           0.85     12972
   macro avg       0.63      0.68      0.65     12972
weighted avg       0.88      0.85      0.86     12972

accuracy: 0.8493678692568609
[[10389  1268]
 [  686   629]]


In [11]:
xgb_clf = XGBClassifier(scale_pos_weight= (y_train==0).sum() / (y_train==1).sum(), eval_metric='logloss', random_state=42)
xgb_clf.fit(X_train_res, y_train_res)
y_pred_xgb = xgb_clf.predict(X_test)
print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb))
print("accuracy:", xgb_clf.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred_xgb))

XGBoost Classification Report:
              precision    recall  f1-score   support

       False       0.97      0.68      0.79     11657
        True       0.22      0.79      0.34      1315

    accuracy                           0.69     12972
   macro avg       0.59      0.73      0.57     12972
weighted avg       0.89      0.69      0.75     12972

accuracy: 0.6869411039161271
[[7874 3783]
 [ 278 1037]]


In [12]:
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=3),
    n_estimators=200,
    learning_rate=0.5,
    random_state=42
)

ada_clf.fit(X_train_res, y_train_res)
y_pred_ada = ada_clf.predict(X_test)
print("AdaBoost Classification Report:")
print(classification_report(y_test, y_pred_ada))
print("accuracy:", ada_clf.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred_ada))

AdaBoost Classification Report:
              precision    recall  f1-score   support

       False       0.95      0.85      0.89     11657
        True       0.30      0.59      0.40      1315

    accuracy                           0.82     12972
   macro avg       0.63      0.72      0.65     12972
weighted avg       0.88      0.82      0.84     12972

accuracy: 0.8197656490903484
[[9855 1802]
 [ 536  779]]


In [13]:
# For CatBoost
df_cat = pd.read_csv('./data/feature_matrix_cat.csv')
df_cat.head()


Unnamed: 0,country,year,city,main_category,funding_rounds,funding_filled,first_funding_year,last_funding_year,funding_duration_days,avg_funding_per_round
0,IND,2015,Mumbai,Media,1,10000000,2015,2015,0,9999900.0
1,USA,2014,Delaware City,Application Platforms,2,700000,2014,2014,227,349998.3
2,GBR,2014,Unknown,Apps,1,3406878,2014,2014,0,3406844.0
3,CHN,2007,Beijing,Curated Web,1,2000000,2008,2008,0,1999980.0
4,USA,2010,Champaign,Software,1,10070591,2014,2014,0,10070490.0


In [14]:
df_cat.isnull().sum()

country                  0
year                     0
city                     0
main_category            0
funding_rounds           0
funding_filled           0
first_funding_year       0
last_funding_year        0
funding_duration_days    0
avg_funding_per_round    0
dtype: int64

In [17]:

X = df_cat 
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# List of categorical columns
cat_features = ['country', 'city', 'main_category']

# Train CatBoost
cat_clf = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    eval_metric='F1',
    class_weights=[1, (y_train==0).sum() / (y_train==1).sum()],
    random_seed=42,
    logging_level='Silent',
)

cat_clf.fit(    X_train, y_train,eval_set=(X_test, y_test),cat_features=cat_features,use_best_model=True,early_stopping_rounds=50)
y_pred = cat_clf.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

       False       0.97      0.76      0.85     11657
        True       0.27      0.78      0.40      1315

    accuracy                           0.76     12972
   macro avg       0.62      0.77      0.62     12972
weighted avg       0.90      0.76      0.80     12972



In [18]:
print("accuracy:", cat_clf.score(X_test, y_test))

accuracy: 0.759481961147086
