In [34]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [35]:
# import the data
trans_df = pd.read_csv('../input/transformed_train_folds.csv')

In [36]:
X_train = trans_df[trans_df['kfold'] != 0]
X_test = trans_df[trans_df['kfold'] == 0]

y_train = X_train['TPSA_tran'].values
y_test = X_test['TPSA_tran'].values

X_train = X_train.drop(columns=['TPSA_tran'])
X_test = X_test.drop(columns=['TPSA_tran'])

In [37]:
pd_df.head()

Unnamed: 0,toxic,FormalCharge,TPSA,MolWt,HeavyAtoms,NHOH,HAcceptors,HDonors,Heteroatoms,AromaticRings,SaturatedRings,AromaticOH,AromaticN,LogP,kfold
0,0,1,46.53,332.464,24,1,3,1,4,1,2,1,0,2.8541,0
1,0,1,60.14,255.363,17,2,2,3,5,0,0,0,0,0.0079,0
2,0,-2,168.58,408.417,27,3,10,2,12,1,1,0,1,-2.7176,0
3,0,0,46.17,183.251,13,1,2,1,3,0,1,0,0,1.1278,0
4,0,0,772.17,1448.686,78,16,51,16,67,0,2,0,0,-19.3965,0


In [38]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [39]:
y_preds = rf.predict(X_test)
r2 = r2_score(y_test, y_preds)
mse = mean_squared_error(y_test, y_preds)

In [40]:
print('R-squared:', r2)
print('MSE:', mse)

R-squared: 0.9519402958646929
MSE: 0.04616606572557608


In [41]:
# import the data
pd_df = pd.read_csv('../input/train_folds.csv')
X_train = pd_df[pd_df['kfold'] != 0]
X_test = pd_df[pd_df['kfold'] == 0]

y_train = X_train['TPSA'].values
y_test = X_test['TPSA'].values

X_train = X_train.drop(columns=['TPSA'])
X_test = X_test.drop(columns=['TPSA'])

In [42]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

y_preds = rf.predict(X_test)
r2 = r2_score(y_test, y_preds)
mse = mean_squared_error(y_test, y_preds)

print('R-squared:', r2)
print('MSE:', mse)

R-squared: 0.9786603755875184
MSE: 250.0848530180831


In [43]:
def cross_fold_model(df, target, model):
    
    print(f'Model {model}')
    
    final_r2, final_mse = [],[]
    
    for fold in df['kfold'].unique():
    
        X_train = pd_df[pd_df['kfold'] != fold]
        X_test = pd_df[pd_df['kfold'] == fold]

        y_train = X_train[target].values
        y_test = X_test[target].values

        X_train = X_train.drop(columns=[target])
        X_test = X_test.drop(columns=[target])
        
        model.fit(X_train, y_train)

        y_preds = model.predict(X_test)
        r2 = r2_score(y_test, y_preds)
        mse = mean_squared_error(y_test, y_preds)
        
        print(f'Fold {fold}, MSE: {mse}, R-squared:{r2}')
        
        final_r2.append(r2)
        final_mse.append(mse)
        
    print(f'Final Scores:\nR-squared:{round(np.mean(final_r2), 4)}, MSE:{round(np.mean(final_mse), 4)}')
    

            
        
        

In [44]:
rf = RandomForestRegressor()

cross_fold_model(pd_df, 'TPSA', rf)

Model RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)
Fold 0, MSE: 280.1421415085289, R-squared:0.9760956011139627
Fold 1, MSE: 181.14775522270529, R-squared:0.9695181834542118
Fold 2, MSE: 247.92331553938422, R-squared:0.9653309297840516
Fold 3, MSE: 165.93717743859582, R-squared:0.9664537267219272
Fold 4, MSE: 218.9482337463016, R-squared:0.9782200016231504
Final Scores:
R-squared:0.9711, MSE:218.8197


In [45]:
rf = RandomForestRegressor()

cross_fold_model(trans_df, 'TPSA', rf)

Model RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)
Fold 0, MSE: 281.0449455416449, R-squared:0.9760185652649207
Fold 1, MSE: 183.30375672595883, R-squared:0.9691553920841846
Fold 2, MSE: 222.15909839202106, R-squared:0.9689337432241573
Fold 3, MSE: 177.3087688088699, R-squared:0.9641548174744504
Fold 4, MSE: 205.7224945443151, R-squared:0.9795356394496042
Final Scores:
R-squared:0.9716, MSE:213.9078


In [46]:
lr = LinearRegression()
lasso = Lasso()

for model in [lr, lasso]:
    for data in [trans_df, pd_df]:
        if:
            print('Transformed Data')
        else:
            print('Raw Data')
            
        cross_fold_model(data, 'TPSA', model)

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().