In [15]:
import pandas as pd
import dask.dataframe as dd
from dask.multiprocessing import get

import itertools
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model, svm, tree
from sklearn.ensemble import RandomForestRegressor
from  sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns
#import swifter

np.random.seed(1907)

In [16]:
from tqdm import tqdm
# from tqdm.auto import tqdm  # for notebooks

#tqdm.pandas()

In [17]:
def _is_text_dtype(dtype):
    """Return True for dtypes pandas uses to hold free-text CSV columns."""
    return dtype == object or isinstance(dtype, pd.StringDtype)


def _has_multiple_choices(column):
    """Return True when at least one answer in ``column`` contains a ``;`` separator."""
    try:
        return bool(column.str.contains(";", regex=False, na=False).any())
    except AttributeError:
        # Object column without any string values: `.str` is unavailable,
        # so it cannot be a multi-choice column.
        return False


def _expand_multi_choice_column(answers):
    """Build one 0/1 indicator column per distinct ``;``-separated choice.

    Choices are matched as whole tokens, so an answer of ``"JavaScript"`` does
    not set the ``"Java"`` indicator. Rows with a missing answer are NaN in
    every indicator column, so the caller's mean imputation applies to them.
    Columns are named ``<column>_<choice>``.
    """
    indicators = answers.str.get_dummies(sep=";").astype(float)
    indicators.loc[answers.isna().to_numpy(), :] = np.nan
    return indicators.add_prefix(f"{answers.name}_")


def initial_data_preparation():
    """Load the survey CSV and turn it into a numeric modelling table.

    Steps, in order:
      1. Read ``survey_results_public.csv`` from the working directory, falling
         back to the GitHub copy when the local read fails.
      2. Convert the textual boundary answers of ``Age1stCode``, ``YearsCode``
         and ``YearsCodePro`` to numbers and cast those columns to float.
      3. Drop the currency/compensation helper columns and ``Respondent``.
      4. Replace every ``;``-separated multi-choice column with one indicator
         column per choice.
      5. One-hot encode the listed categorical columns.
      6. Drop rows without the ``ConvertedComp`` label and fill the remaining
         missing numeric values with the column mean.

    Returns:
        pandas.DataFrame: the prepared table, including the ``ConvertedComp``
        label column.
    """
    try:
        data = pd.read_csv("survey_results_public.csv")
        print("Data loaded locally.")
    except Exception:
        # Best-effort fallback: any failure to read the local copy triggers the download.
        data = pd.read_csv(r"https://raw.githubusercontent.com/ahmetsirel/ozu_data_science/master/DS%20555%20-%20Data%20Science%20%26%20Strategy/Project/survey_results_public.csv")
        print("Data loaded from github.")

    print(f"{len(data)} Rows.")

    # Numeric columns: map the open-ended textual answers to representative numbers.
    data["Age1stCode"] = data["Age1stCode"].replace('Younger than 5 years', "3").replace('Older than 85', "90").astype(float)
    data["YearsCode"] = data["YearsCode"].replace('Less than 1 year', "0.5").replace('More than 50 years', "55").astype(float)
    data["YearsCodePro"] = data["YearsCodePro"].replace('Less than 1 year', "0.5").replace('More than 50 years', "55").astype(float)

    # Drop unnecessary columns
    columns_to_drop = ["CurrencySymbol", "CompFreq", "CurrencyDesc", "CompTotal", "Respondent"]
    data = data.drop(columns_to_drop, axis=1)

    # Multi-choice columns are the text columns with at least one ";"-separated answer.
    multi_choice_columns = [
        column for column in data.columns
        if _is_text_dtype(data[column].dtype) and _has_multiple_choices(data[column])
    ]

    try:
        # NOTE(review): nothing in this function writes this cache file. If a stale copy
        # exists it is used as-is, including indicator values computed by older code.
        data = pd.read_csv("data_prepared_temp_after_apply.csv")
    except Exception as e:
        print(e)
        print("Multi choice columns processing is started.")
        indicator_frames = [_expand_multi_choice_column(data[column]) for column in multi_choice_columns]
        data = pd.concat([data] + indicator_frames, axis=1)
        print("Multi choice columns processed.")
    data = data.drop(multi_choice_columns, axis=1)

    # One hot encoding
    cat_columns = ["Country", "Gender", "JobSat", "JobSeek",
                   "Employment",
                   "MainBranch",
                   "Hobbyist",
                   "EdLevel",
                   "NEWDevOps",
                   "NEWDevOpsImpt",
                   "NEWEdImpt",
                   "NEWLearn",
                   "NEWOffTopic",
                   "NEWOnboardGood",
                   "NEWOtherComms",
                   "NEWOvertime",
                   "NEWPurpleLink", "OpSys", "OrgSize",
                   "PurchaseWhat", "SOAccount",
                   "SOComm", "SOPartFreq", "SOVisitFreq",
                   "SurveyEase", "SurveyLength", "Trans", "UndergradMajor", "WelcomeChange"]
    # Columns already expanded as multi-choice no longer exist and must be skipped.
    cat_columns = [col for col in cat_columns if col not in multi_choice_columns]
    data = pd.get_dummies(data=data, columns=cat_columns)

    # Rows without the label cannot be used for training or evaluation.
    data = data.dropna(subset=["ConvertedComp"], axis=0)

    # Fill NaNs with the column mean. Assigning the result (instead of a chained
    # in-place fillna on a column selection) keeps this working under copy-on-write.
    data = data.fillna(data.mean(numeric_only=True))

    #LABEL :"ConvertedComp"
    return data

In [18]:
# Set to False to reuse a previously exported data_prepared.csv instead of
# re-running the full preparation. True keeps the previous behaviour of this
# cell, which always rebuilt the table.
FORCE_DATA_REBUILD = True

data = None
if not FORCE_DATA_REBUILD:
    try:
        data = pd.read_csv("data_prepared.csv")
    except (OSError, ValueError) as e:
        # Missing or unreadable cache (pandas parser errors derive from ValueError): rebuild below.
        print(e)

if data is None:
    data = initial_data_preparation()
    data.to_csv("data_prepared.csv", index=False)


Data loaded locally.
64461 Rows.


In [19]:
def prepare_data(data=data, exclude_columns=None, features_to_use=None, apply_x_col=None, columns_to_bin=None):
    """Split the prepared table into train/test features and the ``ConvertedComp`` label.

    Args:
        data: prepared DataFrame containing a ``ConvertedComp`` column. The default
            is the module-level ``data`` as it was when this function was defined.
        exclude_columns: column names to drop before splitting. Mutually exclusive
            with ``features_to_use``.
        features_to_use: column names to keep as features (the label is kept
            automatically). Mutually exclusive with ``exclude_columns``.
        apply_x_col: optional callable passed to ``DataFrame.apply`` on the feature
            frame before splitting.
        columns_to_bin: feature columns to replace with quartile-bin dummy columns.
            Bin edges are computed on the training split only and then reused for
            the test split. Names that are not feature columns are ignored.

    Returns:
        tuple: ``(x_train, x_test, y_train, y_test, data)`` where ``data`` is the
        frame after column exclusion/selection.

    Raises:
        ValueError: if both ``exclude_columns`` and ``features_to_use`` are given.
    """
    if exclude_columns is not None and features_to_use is not None:
        raise ValueError("exclude_columns and features_to_use cannot be used at the same time.")

    if exclude_columns is not None:
        data = data.drop(exclude_columns, axis=1)

    if features_to_use is not None:
        # Keep the label next to the selected features. The previous code selected a
        # non-existent "revenue" column here, so this branch always raised KeyError.
        selected = [column for column in features_to_use if column != "ConvertedComp"]
        data = data[selected + ["ConvertedComp"]]

    x = data.drop('ConvertedComp', axis=1)
    y = data['ConvertedComp']

    if apply_x_col is not None:
        x = x.apply(apply_x_col)

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.25, random_state=42)

    if columns_to_bin is not None:
        # De-duplicate while keeping order; skip names that are not feature columns.
        present = [column for column in dict.fromkeys(columns_to_bin) if column in x_train.columns]
        if present:
            # Work on copies so the column assignments below never touch the source frame.
            x_train, x_test = x_train.copy(), x_test.copy()
            for column in present:
                binned_train, bin_edges = pd.qcut(x_train[column], q=4, retbins=True, duplicates="drop")
                x_train[column] = binned_train
                # include_lowest=True mirrors qcut, which closes the first interval on the
                # left; without it, test values equal to the train minimum become NaN.
                x_test[column] = pd.cut(x_test[column], bins=bin_edges, include_lowest=True)

            x_train = pd.get_dummies(x_train)
            # Guarantee the test matrix has exactly the training columns, in the same order.
            x_test = pd.get_dummies(x_test).reindex(columns=x_train.columns, fill_value=0)

    return x_train, x_test, y_train, y_test, data

In [20]:
def scale_data(x_train, x_test, scaler=StandardScaler):
    """Scale both feature frames with a scaler fitted on the training frame only.

    Args:
        x_train: training features; the scaler is fitted on this frame.
        x_test: test features; transformed with the already fitted scaler.
        scaler: scaler class (not an instance); it is instantiated with no arguments.

    Returns:
        tuple: ``(x_train_scaled, x_test_scaled)`` as DataFrames that keep the
        index and column labels of their inputs. NaNs remaining after scaling
        are replaced by the column mean of the respective scaled frame.
    """
    fitted_scaler = scaler()
    fitted_scaler.fit(x_train)

    def _transform(features):
        # Re-wrap the transformed values so index and column labels survive scaling.
        scaled = pd.DataFrame(
            fitted_scaler.transform(features),
            index=features.index,
            columns=features.columns,
        )
        return scaled.fillna(scaled.mean())

    return _transform(x_train), _transform(x_test)

In [21]:
def evaluate_regression(y_true_train, y_pred_train, y_true_test, y_pred_test, model_name="", model=None, num_feat=""):
    """Return a one-row DataFrame of regression metrics for the train and test splits.

    The row is indexed by ``model_name``. Its columns are, for each split suffix
    (``_train`` then ``_test``): ``mean_squared_error``, ``mean_absolute_error``,
    ``max_error`` and ``r2_score``; followed by ``model_object`` (the ``model``
    argument) and ``num_feat``.
    """
    metric_functions = (
        ("mean_squared_error", metrics.mean_squared_error),
        ("mean_absolute_error", metrics.mean_absolute_error),
        ("max_error", metrics.max_error),
        ("r2_score", metrics.r2_score),
    )
    splits = (
        ("train", y_true_train, y_pred_train),
        ("test", y_true_test, y_pred_test),
    )

    row = []
    column_names = []
    for split_name, y_true, y_pred in splits:
        for metric_name, metric_function in metric_functions:
            row.append(metric_function(y_true, y_pred))
            column_names.append(f"{metric_name}_{split_name}")

    row.extend([model, num_feat])
    column_names.extend(["model_object", "num_feat"])

    return pd.DataFrame.from_records([row], index=[model_name], columns=column_names)


In [22]:
def run_models(x_train, x_test, y_train, y_test, feture_elimination_num_feat=None):
    """Fit a fixed set of regressors and collect their train/test metrics.

    Args:
        x_train, x_test: feature frames for fitting and evaluation.
        y_train, y_test: matching targets.
        feture_elimination_num_feat: ``None`` to fit every model on all features;
            an iterable of feature counts to wrap the model in ``RFE`` once per
            count; or ``"auto"`` for five evenly spaced counts between 5 and the
            number of columns in ``x_train``.

    Returns:
        pandas.DataFrame: one row per evaluated model (see ``evaluate_regression``),
        or an empty DataFrame if nothing was evaluated.
    """
    candidate_models = [linear_model.LinearRegression(),
                        linear_model.RidgeCV(),
                        linear_model.LassoCV(),
                        svm.LinearSVR(),
                        #svm.SVR(kernel="rbf",),
                        #svm.SVR(kernel="poly"),
                        tree.DecisionTreeRegressor(),
                        RandomForestRegressor()]

    # Rows are collected in a list and concatenated once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the accumulated frame on every call.
    result_frames = []

    def _collect():
        return pd.concat(result_frames) if result_frames else pd.DataFrame()

    feature_counts = feture_elimination_num_feat
    # The isinstance guard avoids an ambiguous element-wise comparison when an
    # array of feature counts is passed in.
    if isinstance(feature_counts, str) and feature_counts == "auto":
        feature_counts = np.linspace(5, len(x_train.columns), 5, dtype=int)

    for model in candidate_models:

        if feature_counts is not None:
            try:
                for n_features_to_select in feature_counts:
                    selector = RFE(estimator=model, n_features_to_select=n_features_to_select)
                    selector.fit(x_train, y_train)
                    y_train_pred = selector.predict(x_train)
                    y_test_pred = selector.predict(x_test)

                    result_frames.append(evaluate_regression(y_train, y_train_pred, y_test, y_test_pred,
                                                             model_name=type(selector).__name__, model=selector,
                                                             num_feat=n_features_to_select))
                # NOTE(review): returning here means only the first candidate model is ever
                # evaluated with feature elimination. Behaviour is kept as-is; confirm
                # whether `continue` was intended.
                return _collect()
            except Exception as e:
                # Feature elimination failed for this model: report it and fall back
                # to a plain fit on all features below.
                print(e)

        model.fit(x_train, y_train)

        y_train_pred = model.predict(x_train)
        y_test_pred = model.predict(x_test)

        result_frames.append(evaluate_regression(y_train, y_train_pred, y_test, y_test_pred,
                                                 model_name=type(model).__name__, model=model,
                                                 num_feat=len(x_train.columns)))
    return _collect()


In [23]:
# Accumulates one row per experiment: the best model (lowest test MSE) of each run() call.
experiment_list = pd.DataFrame()

def run(exclude_columns=None, features_to_use=None, name="", apply_x_col=None, x_train_=None, y_train_=None, columns_to_bin=None, feture_elimination_num_feat=None):
    """Run one experiment end to end and record its best model in ``experiment_list``.

    The data is prepared with ``prepare_data``, standardised with ``scale_data``
    and evaluated with ``run_models``.

    Args:
        exclude_columns, features_to_use, apply_x_col, columns_to_bin: forwarded
            to ``prepare_data``.
        name: experiment label; printed, and used as the row label of the entry
            added to ``experiment_list``.
        x_train_, y_train_: optional replacement training set; when given, it is
            used instead of the training split returned by ``prepare_data``.
        feture_elimination_num_feat: forwarded to ``run_models``.

    Returns:
        pandas.DataFrame: all model results sorted by ``mean_squared_error_test``
        (ascending).

    Raises:
        ValueError: if only one of ``x_train_`` / ``y_train_`` is provided.
    """
    global experiment_list
    print(name)

    x_train, x_test, y_train, y_test, _ = prepare_data(exclude_columns=exclude_columns,
                                                       features_to_use=features_to_use,
                                                       apply_x_col=apply_x_col,
                                                       columns_to_bin=columns_to_bin)

    if x_train_ is not None or y_train_ is not None:
        # Supplying only one of the two would silently replace the other with None.
        if x_train_ is None or y_train_ is None:
            raise ValueError("x_train_ and y_train_ must be provided together.")
        x_train, y_train = x_train_, y_train_

    x_train, x_test = scale_data(x_train, x_test, scaler=StandardScaler)

    results = run_models(x_train, x_test, y_train, y_test, feture_elimination_num_feat=feture_elimination_num_feat)

    # Sort once and reuse the ranking for the printout, the summary row and the return value.
    ranked = results.sort_values("mean_squared_error_test")

    print("Min mean_squared_error_test Test", ranked.iloc[0]["mean_squared_error_test"])

    # NOTE(review): rename() with a plain mapping targets the row index, so row 0 is
    # relabelled to `name` but the "index" column is NOT renamed to "model_name".
    # Kept unchanged in case later cells read the "index" column.
    best_model = ranked.reset_index().loc[[0], ["mean_squared_error_train",
                                                "mean_squared_error_test",
                                                "index",
                                                "num_feat",
                                                "model_object"]].rename({0: name, "index": "model_name"})
    if experiment_list is not None:
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        experiment_list = pd.concat([experiment_list, best_model])

    return ranked

In [24]:
# Baseline experiment: all prepared features, no column exclusion, binning or feature elimination.
results = run(exclude_columns=None, name="Initial Run")

Initial Run
Min mean_squared_error_test Test 43085108519.26064


In [25]:
# Per-model metrics of the baseline run; run() returns them sorted by test MSE (ascending).
results

Unnamed: 0,mean_squared_error_train,mean_absolute_error_train,max_error_train,r2_score_train,mean_squared_error_test,mean_absolute_error_test,max_error_test,r2_score_test,model_object,num_feat
LassoCV,46723600000.0,82652.62627,1898065.0,0.111645,43085110000.0,79170.81,1918945.0,0.1042174,LassoCV(),609
RidgeCV,46052360000.0,87719.792353,1889642.0,0.124407,43663720000.0,85791.03,1914107.0,0.09218736,"RidgeCV(alphas=array([ 0.1, 1. , 10. ]))",609
RandomForestRegressor,6851607000.0,34833.819971,926861.7,0.86973,45622470000.0,89245.03,1916067.0,0.05146307,"(DecisionTreeRegressor(max_features='auto', ra...",609
LinearSVR,58330870000.0,83422.567709,1991409.0,-0.109044,53092650000.0,78943.92,1989502.0,-0.1038494,LinearSVR(),609
DecisionTreeRegressor,0.0,0.0,0.0,1.0,105498700000.0,110177.8,2000000.0,-1.193424,DecisionTreeRegressor(),609
LinearRegression,46054450000.0,87817.236287,1889829.0,0.124367,4.025664e+30,21524540000000.0,1.870267e+17,-8.36976e+19,LinearRegression(),609


In [26]:
# Recreate the default train/test split at top level so the feature matrix can be inspected.
x_train, x_test, y_train, y_test, data = prepare_data()

In [27]:
# All feature columns of the training matrix (the ConvertedComp label is not among them).
x_train.columns.to_list()

['Age',
 'Age1stCode',
 'DatabaseDesireNextYear_Cassandra',
 'DatabaseDesireNextYear_Couchbase',
 'DatabaseDesireNextYear_DynamoDB',
 'DatabaseDesireNextYear_Elasticsearch',
 'DatabaseDesireNextYear_Firebase',
 'DatabaseDesireNextYear_IBM DB2',
 'DatabaseDesireNextYear_MariaDB',
 'DatabaseDesireNextYear_Microsoft SQL Server',
 'DatabaseDesireNextYear_MongoDB',
 'DatabaseDesireNextYear_MySQL',
 'DatabaseDesireNextYear_Oracle',
 'DatabaseDesireNextYear_PostgreSQL',
 'DatabaseDesireNextYear_Redis',
 'DatabaseDesireNextYear_SQLite',
 'DatabaseWorkedWith_Cassandra',
 'DatabaseWorkedWith_Couchbase',
 'DatabaseWorkedWith_DynamoDB',
 'DatabaseWorkedWith_Elasticsearch',
 'DatabaseWorkedWith_Firebase',
 'DatabaseWorkedWith_IBM DB2',
 'DatabaseWorkedWith_MariaDB',
 'DatabaseWorkedWith_Microsoft SQL Server',
 'DatabaseWorkedWith_MongoDB',
 'DatabaseWorkedWith_MySQL',
 'DatabaseWorkedWith_Oracle',
 'DatabaseWorkedWith_PostgreSQL',
 'DatabaseWorkedWith_Redis',
 'DatabaseWorkedWith_SQLite',
 'DevType