In [1]:
# Import Dependencies
import pandas as pd
import numpy as np
# Display all rows
pd.set_option('display.max_rows', None)

# Display all columns
pd.set_option('display.max_columns', None)

# Show the full width of the content
pd.set_option('display.width', None)

In [2]:
# Read in data
Gradient = pd.read_csv("../Data/GradientBoosting.csv")
RandomForest = pd.read_csv("../Data/RandomForest.csv")
KNeighbors = pd.read_csv("../Data/KNeighbors.csv")
Actual = pd.read_csv("../Data/AdultCensusUpdated.csv")
NN = pd.read_csv("../Data/SequentialNN.csv")
DT = pd.read_csv("../Data/DecisionTree.csv")
LR = pd.read_csv("../Data/LogisticRegression.csv")

In [3]:
LR.head()

Unnamed: 0,index,age,workclass,education_num,marital_status,occupation,relationship,race,sex,hours_per_week,income,state,Results,Model
0,1,82,Private,9,Widowed,Exec-managerial,Not-in-family,White,Female,18,<=50K,Maryland,<=50K,LogisticRegression
1,4,41,Private,10,Separated,Prof-specialty,Own-child,White,Female,40,<=50K,Florida,<=50K,LogisticRegression
2,13,32,Private,14,Separated,Exec-managerial,Not-in-family,White,Male,55,>50K,New Hampshire,<=50K,LogisticRegression
3,16,45,Private,7,Divorced,Transport-moving,Not-in-family,White,Male,76,>50K,Wyoming,<=50K,LogisticRegression
4,22,61,Private,9,Divorced,Sales,Unmarried,White,Female,25,<=50K,New York,<=50K,LogisticRegression


In [4]:
# Column Names
Gradient.columns

Index(['index', 'age', 'workclass', 'education_num', 'marital_status',
       'occupation', 'relationship', 'race', 'sex', 'hours_per_week', 'income',
       'state', 'Results', 'Model'],
      dtype='object')

In [5]:
# Column Names
KNeighbors.columns

Index(['Unnamed: 0', 'age', 'workclass', 'fnlwgt', 'education',
       'education.num', 'marital.status', 'occupation', 'relationship', 'race',
       'sex', 'capital.gain', 'capital.loss', 'hours.per.week',
       'native.country', 'income', 'State', 'Results', 'Model'],
      dtype='object')

In [6]:
# Data Cleaning
RandomForest['Results'] = RandomForest['Results'].replace({'>50K': 1, '<=50K': 0})
Gradient['Results'] = Gradient['Results'].replace({'>50K': 1, '<=50K': 0})
LR['Results'] = LR['Results'].replace({'>50K': 1, '<=50K': 0})

  RandomForest['Results'] = RandomForest['Results'].replace({'>50K': 1, '<=50K': 0})
  Gradient['Results'] = Gradient['Results'].replace({'>50K': 1, '<=50K': 0})
  LR['Results'] = LR['Results'].replace({'>50K': 1, '<=50K': 0})


In [7]:
# Drop unnecessary columns
KNeighbors = KNeighbors.drop(columns= ["fnlwgt", "education", "capital.gain", "capital.loss", "native.country"])
DT = DT.drop(columns= ["fnlwgt", "education", "capital.gain", "capital.loss", "native.country"])
# Rename Columns
KNeighbors.columns = ['index', 'age', 'workclass', 'education_num', 'marital_status',
       'occupation', 'relationship', 'race', 'sex', 'hours_per_week', 'income',
       'state', 'Results', 'Model']
DT.columns = ['index', 'age', 'workclass', 'education_num', 'marital_status',
       'occupation', 'relationship', 'race', 'sex', 'hours_per_week', 'income',
       'state', 'Results', 'Model']


In [8]:
# Column Names
Actual.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'capital.gain', 'capital.loss', 'hours.per.week', 'native.country',
       'income', 'State'],
      dtype='object')

In [9]:
# Combine the dataframes row-wise
combined_df = pd.concat([Gradient, RandomForest, KNeighbors, NN, DT, LR], ignore_index=True)

In [10]:
# Clean data
X = len(Actual)
null_values = []
model = []
for y in range(len(Actual)):
    null_values.append("N/A")
    model.append("Actual")
Actual["index"] = null_values
Actual["Results"] = null_values
Actual["Model"] = model

In [11]:
combined_df.loc[combined_df["Model"] == "Sequential NN", ["Model"]] = "NeuralNetwork"

In [12]:
# Drop unneccesary Columns
Actual = Actual.drop(columns= ["fnlwgt", "education", "capital.gain", "capital.loss", "native.country"])
Actual.columns

Index(['age', 'workclass', 'education.num', 'marital.status', 'occupation',
       'relationship', 'race', 'sex', 'hours.per.week', 'income', 'State',
       'index', 'Results', 'Model'],
      dtype='object')

In [13]:
# Reorder columns manually
desired_columns = ['index', 'age', 'workclass', 'education.num', 'marital.status',
       'occupation', 'relationship', 'race', 'sex', 'hours.per.week', 'income',
       'State', 'Results', 'Model']
Actual = Actual[desired_columns]

In [14]:
# Rename Columns
Actual.columns = ['index', 'age', 'workclass', 'education_num', 'marital_status',
       'occupation', 'relationship', 'race', 'sex', 'hours_per_week', 'income',
       'state', 'Results', 'Model']

In [15]:
# Combine the dataframes row-wise
combined_df = pd.concat([combined_df, Actual], ignore_index=True)

In [16]:
# Confirm Results
combined_df.tail()

Unnamed: 0,index,age,workclass,education_num,marital_status,occupation,relationship,race,sex,hours_per_week,income,state,Results,Model
79097,,22,Private,10,Never-married,Protective-serv,Not-in-family,White,Male,40,<=50K,North Dakota,,Actual
79098,,27,Private,12,Married-civ-spouse,Tech-support,Wife,White,Female,38,<=50K,Texas,,Actual
79099,,40,Private,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,>50K,Colorado,,Actual
79100,,58,Private,9,Widowed,Adm-clerical,Unmarried,White,Female,40,<=50K,Kansas,,Actual
79101,,22,Private,9,Never-married,Adm-clerical,Own-child,White,Male,20,<=50K,New Hampshire,,Actual


In [17]:
classification_df = pd.read_csv("../Data/mergedClassificationReport.csv")

In [18]:
classification_df.head()

Unnamed: 0.1,Unnamed: 0,class,precision,recall,f1-score,support,model
0,0,0,0.872438,0.895341,0.883741,5752,NeuralNetwork
1,1,1,0.661227,0.60944,0.634278,1928,NeuralNetwork
2,2,accuracy,0.823568,0.823568,0.823568,0,NeuralNetwork
3,3,macro avg,0.766832,0.75239,0.759009,7680,NeuralNetwork
4,4,weighted avg,0.819415,0.823568,0.821115,7680,NeuralNetwork


In [19]:
classification_df = classification_df.rename(columns={"model": "Model"})

In [20]:
# Merge
merged_df = pd.merge(combined_df, classification_df, on="Model", how="left")
merged_df.columns

Index(['index', 'age', 'workclass', 'education_num', 'marital_status',
       'occupation', 'relationship', 'race', 'sex', 'hours_per_week', 'income',
       'state', 'Results', 'Model', 'Unnamed: 0', 'class', 'precision',
       'recall', 'f1-score', 'support'],
      dtype='object')

In [21]:

merged_df = merged_df.drop(columns= ["Unnamed: 0"])

In [22]:
merged_df.head()

Unnamed: 0,index,age,workclass,education_num,marital_status,occupation,relationship,race,sex,hours_per_week,income,state,Results,Model,class,precision,recall,f1-score,support
0,1,82,Private,9,Widowed,Exec-managerial,Not-in-family,White,Female,18,<=50K,Maryland,0.0,GradientBoosting,0,0.867816,0.918811,0.892586,5752.0
1,1,82,Private,9,Widowed,Exec-managerial,Not-in-family,White,Female,18,<=50K,Maryland,0.0,GradientBoosting,1,0.706289,0.582469,0.638431,1928.0
2,1,82,Private,9,Widowed,Exec-managerial,Not-in-family,White,Female,18,<=50K,Maryland,0.0,GradientBoosting,accuracy,0.834375,0.834375,0.834375,0.0
3,1,82,Private,9,Widowed,Exec-managerial,Not-in-family,White,Female,18,<=50K,Maryland,0.0,GradientBoosting,macro avg,0.787053,0.75064,0.765508,7680.0
4,1,82,Private,9,Widowed,Exec-managerial,Not-in-family,White,Female,18,<=50K,Maryland,0.0,GradientBoosting,weighted avg,0.827266,0.834375,0.828782,7680.0


In [23]:
merged_df.loc[merged_df["Model"] == "Actual", ["precision", "recall", "f1-score", "support", "class"]] = "N/A"

  merged_df.loc[merged_df["Model"] == "Actual", ["precision", "recall", "f1-score", "support", "class"]] = "N/A"
  merged_df.loc[merged_df["Model"] == "Actual", ["precision", "recall", "f1-score", "support", "class"]] = "N/A"
  merged_df.loc[merged_df["Model"] == "Actual", ["precision", "recall", "f1-score", "support", "class"]] = "N/A"
  merged_df.loc[merged_df["Model"] == "Actual", ["precision", "recall", "f1-score", "support", "class"]] = "N/A"


In [24]:
merged_df.tail()

Unnamed: 0,index,age,workclass,education_num,marital_status,occupation,relationship,race,sex,hours_per_week,income,state,Results,Model,class,precision,recall,f1-score,support
265261,,22,Private,10,Never-married,Protective-serv,Not-in-family,White,Male,40,<=50K,North Dakota,,Actual,,,,,
265262,,27,Private,12,Married-civ-spouse,Tech-support,Wife,White,Female,38,<=50K,Texas,,Actual,,,,,
265263,,40,Private,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,>50K,Colorado,,Actual,,,,,
265264,,58,Private,9,Widowed,Adm-clerical,Unmarried,White,Female,40,<=50K,Kansas,,Actual,,,,,
265265,,22,Private,9,Never-married,Adm-clerical,Own-child,White,Male,20,<=50K,New Hampshire,,Actual,,,,,


In [25]:
# Export Data
merged_df.to_csv("../Data/mergedDataFrame.csv")