In [26]:
import numpy as np  # data science-y operations library
import pandas as pd  # dataframe library
import matplotlib.pyplot as plt  # plot visualization library

In [27]:
# cleanses dataset
def shrinker(df):
    # prune cols that have no values
    description = df.describe(include="all")
    description = description.transpose()
    nonzero_df = df[description[description["count"] > 0].index] 
    # prune cols comprised of only one value
    non_redundant = []
    for col in nonzero_df.columns.values:
        if len(np.unique(nonzero_df[col].tolist())) > 1:
            non_redundant.append(col)
    return nonzero_df[non_redundant]

mem_df = shrinker(pd.read_csv("ClearSkyedata.csv"))

In [50]:
# breaks up IGA Objects' titles into three substring columns
def split_titles(df, col_name):
    title_spl = [x if len(x)==3 else [x[0], x[1]+' '+x[2], x[3]] for x in df[col_name].str.split(' ').tolist()]
    df['title0'] = [x[0] for x in title_spl]
    df['title1'] = [x[1] for x in title_spl]
    df['title2'] = [x[2] for x in title_spl]
split_titles(mem_df, 'ilm_object.description')

In [39]:
# more dataset cleansing, removing specified columns
mem_df_clean = mem_df.fillna("none")
mem_df_clean.drop(["group.description", "group", "ilm_object", "ilm_object.name", "ilm_object.description", 
                    "group.source"], axis=1, inplace=True)

In [40]:
# add fit column
mem_df_clean['fit'] = 1

In [1]:
# append a df of random pseudo datapoints based on given dataset
mem_df_cross = mem_df_clean.copy()
df_dict = {}
for col, values in mem_df_cross.iteritems():
    if col == 'fit':  # indicate the new datapoints don't fit entitlements
        df_dict[col] = np.full(10000, 0)
    else:  # randomize the row value from previous values
        df_dict[col] = np.random.choice(np.unique(values.tolist()), size = 10000)

mem_df_cross = mem_df_cross.append(pd.DataFrame(df_dict))

In [42]:
from sklearn.preprocessing import OneHotEncoder  # tool to discretely quantify qualitative data
from sklearn.model_selection import train_test_split

OH_enc = OneHotEncoder()
X = OH_enc.fit_transform(mem_df_cross.drop('fit', axis = 1))  # create one hot encoded features
y = mem_df_cross['fit']
# split up dataset for model, one to train model, one to test it
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=300)

In [43]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(random_state=88)
logreg.fit(X_train, y_train)

LogisticRegression(random_state=88)

In [44]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

y_prob = logreg.predict_proba(X_test)
y_pred = pd.Series([1 if x > 0.5 else 0 for x in y_prob[:,1]], index=y_test.index)

cm = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix: \n", cm)
print ("\nAccuracy:", accuracy_score(y_test, y_pred))

Confusion Matrix: 
 [[2864  127]
 [  99 2910]]

Accuracy: 0.9623333333333334


In [68]:
real_df = pd.read_csv('realData.csv')
real_df = real_df.fillna('none')
real_df.drop(['parent'], axis=1, inplace=True)
real_df['access'] = (real_df['action'] == 'Keep Access') * 1
real_df.drop(['action'], axis=1, inplace=True)
# TODO: PROBLEM WITH DIFFERING NUMBER OF SUBSTRINGS


Unnamed: 0,ilm_object,parent.ilm_object.description,parent.ilm_object.ref_x_cls_clear_skye_i_account.department,parent.ilm_object.ref_x_cls_clear_skye_i_account.office,access
0,All Sales Users,Chief Technician Manager,Sales,Emeryville,1
1,All Sales Users,Junior Consultant Normal,Sales,Fremont,0
2,All Legal Department Users,Principal Fellow Employee,Legal Department,Milpitas,1
3,All Legal Department Users,Junior Coordinator Employee,Legal Department,Redmond,1
4,All Legal Department Users,Junior President Contract,Legal Department,Orem,1
...,...,...,...,...,...
49894,All Product Testing Users,Chief Admin Contract,Product Testing,Emeryville,1
49895,All Sales Users,Senior Inspector Manager,Sales,Fremont,0
49896,All Product Testing Users,Associate Visionary Contract,Product Testing,Cupertino,0
49897,All Product Testing Users,Master Manager Normal,Product Testing,Abuja,0


In [69]:
X = OH_enc.fit_transform(real_df.drop('access', axis = 1))  # create one hot encoded features
y = real_df['access']
# split up dataset for model, one to train model, one to test it
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=300)

In [70]:
logreg = LogisticRegression(random_state=88)
logreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=88)

In [72]:
y_prob = logreg.predict_proba(X_test)
y_pred = pd.Series([1 if x > 0.5 else 0 for x in y_prob[:,1]], index=y_test.index)

cm = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix: \n", cm)
print ("\nAccuracy:", accuracy_score(y_test, y_pred))

Confusion Matrix: 
 [[3630 3792]
 [3694 3854]]

Accuracy: 0.4999331997327989
