In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Model selection
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import IncrementalPCA
from sklearn.linear_model import LogisticRegression

# Classifiers and evaluation metrics
from sklearn import metrics
from mpl_toolkits.mplot3d import Axes3D
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score, roc_auc_score,roc_curve

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import os


import warnings  
warnings.filterwarnings('ignore')
from sklearn.neighbors import LocalOutlierFactor

In [37]:
def read_file(folder, samp = False, lof=False):
    """Load a CSV into a DataFrame, optionally subsampling and LOF-filtering it.

    Parameters
    ----------
    folder : value handed straight to ``pd.read_csv`` (i.e. the CSV location).
    samp : when equal to ``True``, keep a fixed random 50% of the rows
        (no replacement, ``random_state=42``) and renumber the index.
    lof : when equal to ``True``, pass the frame through ``filter_lof``.

    Returns
    -------
    The loaded (and optionally sampled / filtered) DataFrame.
    """
    frame = pd.read_csv(folder)
    if samp == True:
        sampled_half = frame.sample(frac=0.5, replace=False, random_state=42)
        frame = sampled_half.reset_index(drop=True)
    return filter_lof(frame) if lof == True else frame

In [39]:
def filter_lof(df, k=20) :
    """Drop the rows that Local Outlier Factor flags as outliers.

    The feature columns (everything except ``'churn'``) are standardized and
    scored with ``LocalOutlierFactor(n_neighbors=k)``; rows predicted as
    inliers (``fit_predict`` > 0) are kept.

    The surviving rows are taken directly from ``df`` with the inlier mask, so
    each row keeps its own ``churn`` label, the original column names and the
    original (unscaled) values. Re-attaching ``churn`` by index to a frame
    rebuilt from the scaled array would pair the labels of the first rows
    with the surviving rows instead.

    Parameters
    ----------
    df : DataFrame containing a ``'churn'`` column plus numeric feature columns.
    k : number of neighbours used by the LOF estimator.

    Returns
    -------
    New DataFrame with the inlier rows, feature columns first and ``'churn'``
    last, with a fresh 0..n-1 index. ``df`` itself is not modified.

    Raises
    ------
    KeyError if ``df`` has no ``'churn'`` column.
    """
    features = df.drop(columns=['churn'])
    scaled = StandardScaler().fit_transform(features)
    # fit_predict yields +1 for inliers and -1 for outliers.
    inlier_mask = LocalOutlierFactor(n_neighbors=k).fit_predict(scaled) > 0
    ordered_cols = list(features.columns) + ['churn']
    return df.loc[inlier_mask, ordered_cols].reset_index(drop=True)

In [61]:
def churn_prediction(algorithm,training_x,testing_x,
                             training_y,testing_y,cols,cf,threshold_plot) :
    """Fit ``algorithm`` on the training split and return its test accuracy.

    Parameters
    ----------
    algorithm : estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
        fitted in place.
    training_x, training_y : training features and labels.
    testing_x, testing_y : held-out features and labels used for scoring.
    cols : feature names; accepted for interface compatibility, not used.
    cf : ``"coefficients"`` or ``"features"``. Kept for interface
        compatibility; validated up front so a bad value fails before the
        (potentially slow) fit instead of afterwards.
    threshold_plot : accepted for interface compatibility, not used.

    Returns
    -------
    ``accuracy_score(testing_y, predictions)``.

    Raises
    ------
    ValueError if ``cf`` is not one of the two accepted values.
    """
    if cf not in ("coefficients", "features"):
        raise ValueError(
            "cf must be 'coefficients' or 'features', got %r" % (cf,))

    algorithm.fit(training_x,training_y)
    predictions = algorithm.predict(testing_x)
    # Only the accuracy is reported; the coefficient table, the class
    # probabilities and the confusion matrix were never returned, so they are
    # no longer computed (this also lifts the requirement that the estimator
    # exposes coef_/feature_importances_/predict_proba).
    return accuracy_score(testing_y,predictions)


In [55]:
def train_model(filename, algo, seed, before) :
    """Train ``algo`` on a 70/30 split of the data in ``filename``.

    The data is loaded through ``readXY(filename, before)``, split with
    ``train_test_split(test_size=0.3, random_state=seed)`` and handed to
    ``churn_prediction`` (with ``cf="features"``), which fits ``algo`` in place.

    Returns
    -------
    ``(algo, accuracy)`` - the same estimator object that was passed in and the
    value returned by ``churn_prediction``.
    """
    features, labels = readXY(filename, before)
    split = train_test_split(features, labels, test_size=0.3, random_state=seed)
    x_train, x_test, y_train, y_test = split

    accuracy = churn_prediction(
        algo, x_train, x_test, y_train, y_test,
        x_train.columns, "features", threshold_plot=False,
    )
    return algo, accuracy

In [57]:
def readXY(filename,before=True) :
    """Load ``filename`` and split it into features ``X`` and label frame ``Y``.

    The file is read with ``read_file`` (no sampling, no LOF filtering), the
    ``churn`` column is processed in place by ``churn(df, before)``, rows with
    any missing value are dropped and the index is renumbered.

    Returns
    -------
    ``(X, Y)`` where ``X`` is every column except ``'churn'`` and ``Y`` is a
    one-column DataFrame holding ``'churn'``.
    """
    frame = read_file(filename, samp=False, lof=False)
    churn(frame, before)  # modifies `frame` in place; return value not needed
    complete_rows = frame.dropna().reset_index(drop=True)
    return complete_rows.drop(columns=['churn']), complete_rows[['churn']]

In [32]:
def churn_validation(algorithm,testing_x,
                             testing_y,cols) :
    """Score an already-fitted ``algorithm`` on a validation set.

    Parameters
    ----------
    algorithm : fitted estimator exposing ``predict(X)``.
    testing_x, testing_y : validation features and labels.
    cols : feature names; accepted for interface compatibility, not used.

    Returns
    -------
    ``accuracy_score(testing_y, predictions)``.
    """
    # Only hard predictions are needed for accuracy; class probabilities were
    # computed but never used, so estimators without predict_proba now work.
    predictions = algorithm.predict(testing_x)
    return accuracy_score(testing_y,predictions)

In [58]:
def validate_with_raw_data(model, before,
                           raw_path='/data/dataprivacy/churn/telecom_churn_data_pre_nodates.csv') :
    """Return the accuracy of a fitted ``model`` on the raw (non-anonymized) data.

    Parameters
    ----------
    model : fitted estimator, forwarded to ``churn_validation``.
    before : forwarded to ``readXY`` (and from there to ``churn``).
    raw_path : CSV to validate against; defaults to the raw churn dataset so
        existing two-argument calls behave as before.

    Returns
    -------
    The accuracy reported by ``churn_validation``.
    """
    X, Y = readXY(raw_path, before)
    return churn_validation(model, X, Y, X.columns)

# Churn label taken from the anonymized data

In [6]:
# Anonymized dataset (variant alex2 / k10_e10_lof).
df = pd.read_csv(
    filepath_or_buffer='/data/dataprivacy/churn-anonymized/v1/alex2/k10_e10_lof/telecom_churn_data_pre_nodates.csv'
)

In [16]:
# Fraction of rows in `df` whose churn value is below 0.5.
df.loc[df['churn'] < 0.5].shape[0] / df.shape[0]

0.9093596432496621

In [13]:
# Raw (non-anonymized) dataset.
data = pd.read_csv(
    filepath_or_buffer='/data/dataprivacy/churn/telecom_churn_data_pre_nodates.csv'
)

In [17]:
# Fraction of rows in `data` whose churn value is below 0.5.
data.loc[data['churn'] < 0.5].shape[0] / data.shape[0]

0.8980889808898089

# Churn label created after anonymization

In [22]:
def is_churned(_x):
    """Return 1 when all four ``*_9`` usage fields of the row are zero, else 0.

    ``_x`` is a row-like object exposing ``total_ic_mou_9``, ``total_og_mou_9``,
    ``vol_2g_mb_9`` and ``vol_3g_mb_9`` as attributes (e.g. a row passed by
    ``DataFrame.apply(..., axis=1)``).
    """
    usage_fields = (
        _x.total_ic_mou_9,
        _x.total_og_mou_9,
        _x.vol_2g_mb_9,
        _x.vol_3g_mb_9,
    )
    return 1 if all(value == 0 for value in usage_fields) else 0

In [26]:
# Replace the stored churn label of `df` with one recomputed row by row
# from the `*_9` usage columns via `is_churned`.
data_churn = (
    df.drop(columns=['churn'])
      .assign(churn=lambda usage: usage.apply(is_churned, axis=1))
)

In [27]:
# Fraction of rows in `data_churn` whose recomputed churn label is below 0.5.
data_churn.loc[data_churn['churn'] < 0.5].shape[0] / data_churn.shape[0]

0.9252896069935194

In [51]:
def churn(df, before=True):
    """Binarize the ``churn`` column of ``df`` in place at the 0.5 threshold.

    When ``before`` equals ``True``, values above 0.5 become 1 and values at or
    below 0.5 become 0 (missing values match neither rule and are left as is).
    Otherwise the frame is returned untouched.

    Returns
    -------
    The same ``df`` object that was passed in.
    """
    if before == True:
        labels = df['churn']
        # Both masks are taken before any write; rows set to 1 can never fall
        # into the "<= 0.5" group, so this matches sequential evaluation.
        above_half = labels > 0.5
        at_or_below_half = labels <= 0.5
        df.loc[above_half, 'churn'] = 1
        df.loc[at_or_below_half, 'churn'] = 0
    return df

In [42]:
# Candidate classifiers evaluated on every anonymized dataset.
# random_state is fixed so that repeated runs build the same forest; without
# it the bootstrap samples and feature draws differ on every execution and
# the reported accuracies are not reproducible.
algos = [RandomForestClassifier(n_jobs=-1,
                                bootstrap=True,
                                max_depth=10,
                                min_samples_leaf=50,
                                min_samples_split=50,
                                n_estimators=60,
                                random_state=42)]

In [63]:
# For every anonymized dataset variant: train each candidate model on the
# anonymized data (one run per split seed), keep the best test accuracy, and
# score that same model on the raw data. Results are written to CSV after
# every seed so partial progress survives an interruption.
df_eval = pd.DataFrame(columns = ['accuracy'])
before = True

for algo_anon in ['attila1', 'attila2']:
    print(algo_anon)
    # Membership tests are required here: `x == 'a' or 'b'` is always truthy
    # (the bare string 'b' is truthy), which made both branches run for every
    # variant, including the expensive LOF-filtered read.
    if algo_anon in ('alex1b', 'alex2'):
        df_original = read_file('/data/dataprivacy/churn/telecom_churn_data_pre_nodates.csv', lof=True)
        df_original = df_original.dropna().reset_index(drop=True)
    elif algo_anon in ('attila1', 'attila2'):
        df_original = read_file('/data/dataprivacy/churn/telecom_churn_data_pre_nodates.csv')
    for file in os.listdir('/data/dataprivacy/churn-anonymized/v1/'+algo_anon):
        # Only the parameter-setting entries (names starting with 'c' or 'k').
        if not file.startswith(('c', 'k')):
            continue
        print(file)
        # Path of the anonymized CSV for this parameter setting.
        folder = '/data/dataprivacy/churn-anonymized/v1/'+algo_anon+'/'+file+'/'+'telecom_churn_data_pre_nodates.csv'
        for seed in [0,1,2,3]:
            print(seed)
            # Reset per seed so a model left over from a previous seed or
            # dataset can never be validated by mistake.
            ultim_best_scores = float('-inf')
            best_model = None
            for algo in algos:
                print(algo)
                model, accuracy_train =  train_model(folder, algo, seed, before)
                if accuracy_train > ultim_best_scores:
                    ultim_best_scores = accuracy_train
                    best_model = model

            if best_model is None:
                # `algos` is empty: nothing to record for this seed.
                continue

            df_eval.loc[algo_anon+'_'+file+'_'+str(seed)] = ultim_best_scores
            df_eval.loc[algo_anon+'_'+file+'_'+str(seed)+'_original'] = validate_with_raw_data(best_model,before=True)
            df_eval.to_csv('churn_eval_total_class_attila.csv')

attila1
c3_e1
0
RandomForestClassifier(max_depth=10, min_samples_leaf=50, min_samples_split=50,
                       n_estimators=60, n_jobs=-1)
1
RandomForestClassifier(max_depth=10, min_samples_leaf=50, min_samples_split=50,
                       n_estimators=60, n_jobs=-1)
2
RandomForestClassifier(max_depth=10, min_samples_leaf=50, min_samples_split=50,
                       n_estimators=60, n_jobs=-1)
3
RandomForestClassifier(max_depth=10, min_samples_leaf=50, min_samples_split=50,
                       n_estimators=60, n_jobs=-1)
c20_e10
0
RandomForestClassifier(max_depth=10, min_samples_leaf=50, min_samples_split=50,
                       n_estimators=60, n_jobs=-1)
1
RandomForestClassifier(max_depth=10, min_samples_leaf=50, min_samples_split=50,
                       n_estimators=60, n_jobs=-1)
2
RandomForestClassifier(max_depth=10, min_samples_leaf=50, min_samples_split=50,
                       n_estimators=60, n_jobs=-1)
3
RandomForestClassifier(max_depth=10, min_samp

1
RandomForestClassifier(max_depth=10, min_samples_leaf=50, min_samples_split=50,
                       n_estimators=60, n_jobs=-1)
2
RandomForestClassifier(max_depth=10, min_samples_leaf=50, min_samples_split=50,
                       n_estimators=60, n_jobs=-1)
3
RandomForestClassifier(max_depth=10, min_samples_leaf=50, min_samples_split=50,
                       n_estimators=60, n_jobs=-1)
