In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,StandardScaler

# One plotting theme for every figure drawn in this notebook.
sns.set_theme(style="darkgrid")

# Preprocessing helpers. `le` maps the duration labels to integer class ids;
# `scaler` is created here but is not referenced by the visible cells.
le = LabelEncoder()
scaler = StandardScaler()

In [None]:
# Labelled train/test splits. Later cells read the columns `address`,
# `Days_held`, `Amount_Sold(OP)`, `percentage_spent`, `Amount_Received(OP)`
# and `Past_participant` from them.
train, test = (
    pd.read_csv('queried_data/optrain.csv'),
    pd.read_csv('queried_data/optest.csv'),
)

# Candidate address lists that are scored by the trained model further down.
list1, list2 = (
    pd.read_csv('queried_data/list1_query.csv'),
    pd.read_csv('queried_data/list2_query.csv'),
)

In [None]:
# Keep only the first row per address in every frame. The frames are modified
# in place, so the names bound in the loading cell keep pointing at them.
for _frame in (train, test, list1, list2):
    _frame.drop_duplicates(subset=['address'], inplace=True)

In [None]:
# Bucket edges in days. pd.cut uses right-closed intervals by default, so the
# buckets are (-1, 7], (7, 14], (14, 21], (21, 30] and (30, 69].
# NOTE(review): a Days_held value outside (-1, 69] gets NaN as its duration —
# confirm that 69 really is the longest holding period in the data.
bins = [-1, 7, 14, 21, 30, 69]
labels = ['Less than a week', '1-2 weeks', '2-3 weeks', '3-4 weeks', 'Over 4 weeks']

# Add the categorical `duration` target column to both splits.
for _frame in (train, test):
    _frame['duration'] = pd.cut(_frame['Days_held'], bins=bins, labels=labels)


In [None]:
# Training features: every column of `train` except the identifier, the raw
# and bucketed target, and the other columns listed here.
X = train.drop(
    columns=[
        'Days_held',
        'address',
        'duration',
        'Amount_Sold(OP)',
        'percentage_spent',
        'Amount_Received(OP)',
        'Past_participant',
    ]
)

# Integer class ids for the duration buckets. `le` remembers the mapping so the
# test labels can be encoded and predictions decoded with the same classes.
y = le.fit_transform(train['duration'])

In [None]:
# Class balance of the training set: row count per duration bucket, plus an
# 'All' margin row whose count is the denominator for the percentages.
pivot_table = train.pivot_table(
    values='Days_held',
    index='duration',
    aggfunc='count',
    margins=True,
)
pivot_table.columns = ['count']

total_rows = pivot_table.loc['All', 'count']
pivot_table['percentage'] = pivot_table['count'] / total_rows * 100
print(pivot_table)

In [None]:
# Held-out features: the same columns are removed from `test` as were removed
# from `train` when the training matrix was built.
X_test = test.drop(
    columns=[
        'Days_held',
        'address',
        'duration',
        'Amount_Sold(OP)',
        'percentage_spent',
        'Amount_Received(OP)',
        'Past_participant',
    ]
)

# Encode the test labels with the encoder that was fitted on the training labels.
y_test = le.transform(test['duration'])

In [None]:
# Fit a 200-tree random forest with a fixed seed; `fit` returns the estimator
# itself, so construction and training can be chained.
rfc = RandomForestClassifier(n_estimators=200, random_state=42).fit(X, y)

# Predict the held-out split and report the share of exact bucket matches.
rfc_pred = rfc.predict(X_test)
rfc_acc = accuracy_score(y_true=y_test, y_pred=rfc_pred)

print(f"Random Forest Accuracy: {rfc_acc}")


In [None]:
# Pair each training column with the importance the fitted forest gave it.
feature_names = X.columns
importances = rfc.feature_importances_
df_importances = pd.DataFrame({'feature': feature_names, 'importance': importances})

# Horizontal bar chart, one bar per feature.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=df_importances, x='importance', y='feature', orient='h', ax=ax)
ax.set_title('Feature importances')
plt.show()

In [None]:
# Eyeball check: the first ten predicted class ids next to the first ten true ones.
rfc_pred[:10],y_test[:10]

In [None]:
# Columns that must never reach the model: the identifier, and the prediction
# column this cell itself appends. Without excluding the latter, re-running the
# cell would pass the previous predictions to `rfc.predict` as an extra feature
# and the call would fail. `errors='ignore'` covers the first run, when the
# prediction column does not exist yet.
_non_feature_cols = ['address', 'Predicted_Holding_days']

# Predict a duration bucket for every address and decode the class ids back to
# their text labels.
list1_prediction = le.inverse_transform(
    rfc.predict(list1.drop(columns=_non_feature_cols, errors='ignore'))
)
list2_prediction = le.inverse_transform(
    rfc.predict(list2.drop(columns=_non_feature_cols, errors='ignore'))
)

# Backward-compatible alias for the original, misspelled variable name.
lis11_prediction = list1_prediction

list1['Predicted_Holding_days'] = list1_prediction
list2['Predicted_Holding_days'] = list2_prediction

In [None]:
# How many list1 addresses fall into each predicted duration bucket.
list1['Predicted_Holding_days'].value_counts()

In [None]:
# How many list2 addresses fall into each predicted duration bucket.
list2['Predicted_Holding_days'].value_counts()

In [None]:
# Keep the addresses predicted to hold for more than three weeks.
# The comparison is case-sensitive, so these strings must match the bucket
# labels exactly. The list2 filter previously used '3-4 Weeks' (capital W),
# which never matched and silently dropped that bucket. Sharing one list
# keeps both filters in step.
_long_hold_labels = ['3-4 weeks', 'Over 4 weeks']

best_choice1 = list1.loc[list1['Predicted_Holding_days'].isin(_long_hold_labels)]
best_choice2 = list2.loc[list2['Predicted_Holding_days'].isin(_long_hold_labels)]

In [None]:
# Stack the long-hold picks from both lists row-wise (original row indices are
# carried over) and write them out without the index column.
best_list = pd.concat([best_choice1, best_choice2], axis=0)
best_list.to_csv('data/best_list.csv', index=False)

In [None]:
# Write both candidate lists, now carrying their predicted bucket, to disk
# without the index column.
for _frame, _path in (
    (list1, 'classified_data/list1_query_classified.csv'),
    (list2, 'classified_data/list2_query_classified.csv'),
):
    _frame.to_csv(_path, index=False)

In [None]:
# Share of training rows in each duration bucket, in percent. The denominator
# is the full row count of `train`, not the sum of the bucket counts.
value_counts = train['duration'].value_counts()
percentage = value_counts / len(train) * 100
percentage

In [None]:
import pickle

# Persist the fitted forest, then load it back to check the round trip.
# Unpickling is acceptable here only because this notebook wrote the file a
# moment ago; never unpickle a model file from an untrusted source.
with open('model_weights/rfc.pkl', 'wb') as f:
    pickle.dump(rfc, f)

with open('model_weights/rfc.pkl', 'rb') as f:
    rfc = pickle.load(f)

# Score the reloaded classifier on the held-out split. The earlier version
# predicted on the training features and then recomputed accuracy from the
# stale `rfc_pred`, so the reloaded model was never actually evaluated.
y_pred = rfc.predict(X_test)

rfc_acc = accuracy_score(y_test, y_pred)

print(f"Random Forest Accuracy: {rfc_acc}")