In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

train = pd.read_csv("fall_project_dataset/development.csv", index_col=0)
test  = pd.read_csv("fall_project_dataset/evaluation.csv", index_col=0)

# Create a dictionary from the OCCP code to the text representation
import csv
reader = csv.reader(open('produced_documents/occp_to_string.csv', 'r'), delimiter=';')
d = {}
for row in reader:
   k, v = row
   k = float(k)
   d[k] = v
d

# Map the OCCP column to its text values
train["OCCP"] = train["OCCP"].map(d)
test["OCCP"]  = test["OCCP"].map(d)

# Assuming you have 'train' and 'test' DataFrames
train['JWDP_AP'] = train['JWAP'] - train['JWDP']
test['JWDP_AP'] = test['JWAP'] - test['JWDP']

# Create dummy variables for 'OCCP' in the train/test DataFrame
#train_dummies = pd.get_dummies(train['OCCP'], prefix='V')
#test_dummies = pd.get_dummies(test['OCCP'], prefix='V')

# Concatenate the dummy variables with the original DataFrames
#train = pd.concat([train, train_dummies], axis=1)
#test = pd.concat([test, test_dummies], axis=1)

# Drop the original 'OCCP' column if no longer needed
train = train.drop(columns=['OCCP'])
test = test.drop(columns=['OCCP'])

In [2]:
X_train = train.drop(columns=['JWMNP'])
y_train = train['JWMNP']

# Create and train the Decision Tree Classifier
clf = DecisionTreeClassifier(criterion='gini', min_impurity_decrease=0.0001, min_samples_split=100, min_samples_leaf=50)

clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_train)

# Calculate and print accuracy
accuracy = accuracy_score(y_train, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print confusion matrix
conf_matrix = confusion_matrix(y_train, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

# Evaluate the model
print(classification_report(y_train, y_pred))

# Get feature importances
feature_importances = clf.feature_importances_

# Create a DataFrame to display feature importances
importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})

# Sort the DataFrame by importance in descending order
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Filter the DataFrame to select variables with importance > 0.0001
importance_df = importance_df[importance_df['Importance'] > 0.0001]

# Print the sorted feature importances
print('Feature Importances:')
print(importance_df)

# Count the occurrences of each unique value
unique_values, counts = np.unique(y_pred, return_counts=True)
# Create a DataFrame to store the counts
time_frequencies = pd.DataFrame({'Value': unique_values, 'Count': counts})
# Reset the index
time_frequencies = time_frequencies.reset_index(drop=True)
# Print the result
print(time_frequencies)

Accuracy: 85.88%
Confusion Matrix:
[[ 494  110    0 ...    0    0    0]
 [ 392  162    0 ...    0    0    0]
 [ 299  151    0 ...    0    0    0]
 ...
 [   0    0    0 ...    0    0    0]
 [   0    0    0 ...    0    0    8]
 [   4    0    0 ...    0    0 1294]]
              precision    recall  f1-score   support

         1.0       0.37      0.64      0.47       770
         2.0       0.34      0.22      0.27       723
         3.0       0.00      0.00      0.00       624
         4.0       0.00      0.00      0.00       243
         5.0       0.64      0.92      0.76      5697
         6.0       0.00      0.00      0.00       430
         7.0       0.00      0.00      0.00      1128
         8.0       0.00      0.00      0.00       907
         9.0       0.00      0.00      0.00       155
        10.0       0.83      0.94      0.88     11817
        11.0       0.00      0.00      0.00       134
        12.0       0.00      0.00      0.00      1071
        13.0       0.00      0.00 

  _warn_prf(average, modifier, msg_start, len(result))


In [3]:
print(test.columns)
print(len(train))
#predict test values
y_predt = clf.predict(test)
# Concatenate 'id' column with 'y_pred'
# Calculate the starting value for the 'id' column
start_id = len(train)
# Create an 'id' column starting from 'start_id' and increasing by 1
test['id'] = np.arange(start_id, start_id + len(test))
test['y_pred'] = y_predt
# Concatenate 'id' column with 'y_pred'
print(test.columns)
result = test[['id','y_pred']]
# Print the resulting DataFrame
result.columns = ['Id', 'Predicted']
 
print(result)

#save submition file
result.to_csv("submition.csv", index=False)

Index(['COW', 'SCHL', 'MAR', 'POBP', 'WKHP', 'SEX', 'RAC1P', 'MIG', 'HICOV',
       'LANP', 'PAOC', 'PINCP', 'PUBCOV', 'VPS', 'DEAR', 'MIL', 'MIGSP', 'FER',
       'ENG', 'JWAP', 'JWDP', 'OC', 'FDEYEP', 'JWDP_AP'],
      dtype='object')
104642
Index(['COW', 'SCHL', 'MAR', 'POBP', 'WKHP', 'SEX', 'RAC1P', 'MIG', 'HICOV',
       'LANP', 'PAOC', 'PINCP', 'PUBCOV', 'VPS', 'DEAR', 'MIL', 'MIGSP', 'FER',
       'ENG', 'JWAP', 'JWDP', 'OC', 'FDEYEP', 'JWDP_AP', 'id', 'y_pred'],
      dtype='object')
            Id  Predicted
Id                       
104642  104642       60.0
104643  104643        5.0
104644  104644       15.0
104645  104645        5.0
104646  104646        2.0
...        ...        ...
130797  130797        5.0
130798  130798       15.0
130799  130799       25.0
130800  130800       15.0
130801  130801       30.0

[26160 rows x 2 columns]
