In [None]:
import pandas as pd
from pandasql import sqldf
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import cross_val_score
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [None]:
def pysqldf(q):
    """Run the SQL query ``q`` with pandasql against this notebook's global variables.

    Parameters
    ----------
    q : str
        SQL text. Table names are resolved against the notebook-level namespace,
        e.g. ``FROM df`` reads the global ``df``.

    Returns
    -------
    The result of ``sqldf(q, globals())``.
    """
    return sqldf(q, globals())

In [None]:
# One-time setup: uncomment and run these lines if the NLTK corpora are not installed yet
#nltk.download('stopwords')
#nltk.download('wordnet')

In [None]:
# English stop-word list, later passed to CountVectorizer(stop_words=...).
# NLTK raises LookupError when the corpus has not been downloaded yet, so fetch it
# on demand; this keeps a fresh-kernel "Restart & Run All" working.
try:
    stop = stopwords.words('english')
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop = stopwords.words('english')

# WordNet lemmatizer instance (not referenced by the other cells visible in this notebook).
lemmatizer = WordNetLemmatizer()

In [None]:
# Location of the transcript dataset (CSV file hosted on GitHub)
DATA_URL = "https://raw.githubusercontent.com/geoffswc/Internet-Archives-Transcripts/main/data/final_dataset.csv"

# Read the whole CSV into a DataFrame
df = pd.read_csv(DATA_URL)

In [None]:
#df  # uncomment to preview the loaded DataFrame

In [None]:
# Count how many rows fall into each category, most frequent category first
category_counts_query = """
SELECT category, count(*) as category_count
FROM df
GROUP BY category
ORDER BY category_count DESC
"""
pysqldf(category_counts_query)

In [None]:
# Split the data 50/50 into a training half and a test half.
# A fixed random_state makes the split, and every metric derived from it, repeatable across runs.
df_train, df_test = train_test_split(df, test_size=0.5, random_state=42)

In [None]:
# Two-step pipeline: bag-of-words counts (English stop words removed) feeding a random forest.
# random_state pins the forest's internal randomness so that refitting on the same
# training data yields the same model.
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words=stop)),
    ('clf', RandomForestClassifier(random_state=42))
    ])

In [None]:
# Fit only the vectorizer step on the training transcripts and keep the
# resulting document-term matrix for inspection
X_train = text_clf.named_steps['vect'].fit_transform(df_train['computer_transcript'])

In [None]:
# Vocabulary learned by the vectorizer, one entry per column of the document-term matrix.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is its replacement.
text_clf['vect'].get_feature_names_out()

In [None]:
# Display the document-term matrix in dense form (the whole matrix is materialised in memory)
X_train.todense()

In [None]:
# Dense view of the document-term matrix with one column per vocabulary term.
# get_feature_names_out() replaces get_feature_names(), which scikit-learn 1.2 removed.
df_vector = pd.DataFrame(
    X_train.todense(),
    columns=text_clf['vect'].get_feature_names_out(),
)

In [None]:
# Lift pandas' row-display cap (this setting stays in effect for all later cells),
# then show the matrix transposed so that each term is a row
pd.options.display.max_rows = None
df_vector.transpose()

In [None]:
# Peek at the transcript text of the training row at positional index 3
df_train['computer_transcript'].iloc[3]

In [None]:
# Fit the whole pipeline (vectorizer + classifier) on the training half.
# fit() returns the pipeline itself, so the result is deliberately NOT stored in X_train:
# doing so would replace the document-term matrix with the estimator object and break
# any earlier cell that is re-run afterwards. The trailing ';' suppresses the repr output.
text_clf.fit(df_train['computer_transcript'], df_train['category']);

In [None]:
# Pair every vocabulary term with the importance the fitted forest assigned to it.
# get_feature_names_out() replaces get_feature_names(), which scikit-learn 1.2 removed.
feature_df = pd.DataFrame({"feature": text_clf['vect'].get_feature_names_out(),
                           'importance': text_clf['clf'].feature_importances_})

In [None]:
# Rank the features from most to least important using SQL
feature_ranking_query = """
SELECT 
    *
FROM
    feature_df
ORDER BY
    importance DESC
"""
pysqldf(feature_ranking_query)

In [None]:
# Rank the features by importance, highest first, using pandas directly
feature_df.sort_values('importance', ascending=False)

In [None]:
# 4-fold cross-validation scores for the pipeline.
# NOTE(review): the folds are drawn from df_test rather than df_train — confirm this is intended.
cross_val = cross_val_score(
    estimator=text_clf,
    X=df_test['computer_transcript'],
    y=df_test['category'],
    cv=4,
)

In [None]:
# Display the scores returned by cross_val_score (one per fold)
cross_val

In [None]:
# Score the test transcripts: class probabilities first, then hard class labels
test_transcripts = df_test['computer_transcript']
y_proba = text_clf.predict_proba(test_transcripts)
y_pred = text_clf.predict(test_transcripts)

In [None]:
# Display the predicted class probabilities for the test transcripts
y_proba

In [None]:
# Class labels known to the fitted pipeline; the columns of predict_proba follow this order
text_clf.classes_

In [None]:
# Pull each class's probability by label instead of by hard-coded position.
# predict_proba orders its columns like text_clf.classes_, so looking the index up keeps
# the two lists correct if the label ordering ever changes, and fails loudly
# (ValueError) if an expected label is missing instead of silently mislabelling.
class_labels = list(text_clf.classes_)
advertising_idx = class_labels.index('Advertising')
legal_idx = class_labels.index('Legal/Court')

prob_advertising = [p[advertising_idx] for p in y_proba]
prob_legal = [p[legal_idx] for p in y_proba]

In [None]:
# Attach the model outputs to the test rows.
# assign() builds a new DataFrame instead of writing columns into the split result in
# place, which avoids pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
# The 'y-pred' key is passed through a dict because it is not a valid keyword name.
df_test = df_test.assign(**{
    'prob_advertising': prob_advertising,
    'prob_legal': prob_legal,
    'y-pred': y_pred,
})

In [None]:
# Minimum "Advertising" probability required to label a row as Advertising
ADVERTISING_THRESHOLD = .6

# Re-label every test row using the custom threshold rather than the model's own decision
y_adj = [
    "Advertising" if p >= ADVERTISING_THRESHOLD else "Legal/Court"
    for p in prob_advertising
]

# assign() returns a new frame, so the split result is not modified in place
# (avoids pandas' SettingWithCopyWarning)
df_test = df_test.assign(y_adj=y_adj)

In [None]:
# Side-by-side view of the true label, both predicted labels and the class probabilities
comparison_columns = ['category', 'y-pred', 'y_adj', 'prob_advertising', 'prob_legal']
df_test[comparison_columns]

In [None]:
# Predicted labels for the test transcripts.
# NOTE(review): this repeats the predict() call made in an earlier cell on the same input.
y_pred = text_clf.predict(df_test['computer_transcript'])

In [None]:
# NOTE: the metrics below change from run to run unless the train/test split and the random forest are given a fixed random_state

In [None]:
# Binary precision / recall / F-score with "Advertising" as the positive class.
# With average='binary' the fourth return value (support) is None.
precision, recall, fscore, train_support = precision_recall_fscore_support(
    df_test['category'], y_pred, pos_label='Advertising', average='binary')

# Accuracy = share of test rows predicted correctly. Taking the mean of the boolean
# comparison ties the denominator to the rows being scored, rather than to the length
# of the unrelated y_adj list.
accuracy = (y_pred == df_test['category']).mean()

print('Precision: {} / Recall: {} / Accuracy: {} / fscore: {}'.format(
    round(precision, 3), round(recall, 3), round(accuracy, 3), round(fscore, 3)))

In [None]:
# Work the metrics out "by hand" from confusion-matrix counts of the current run,
# instead of hard-coding fractions copied from one particular (unseeded) run.
# "Advertising" is treated as the positive class.
actual_pos = (df_test['category'] == 'Advertising').values
predicted_pos = (y_pred == 'Advertising')

tp = int((actual_pos & predicted_pos).sum())    # Advertising predicted as Advertising
fp = int((~actual_pos & predicted_pos).sum())   # other category predicted as Advertising
fn = int((actual_pos & ~predicted_pos).sum())   # Advertising predicted as the other category
tn = int((~actual_pos & ~predicted_pos).sum())  # other category predicted as the other category

# Guard the ratios whose denominator can be zero (no predicted / no actual positives)
print('precision', tp / (tp + fp) if (tp + fp) else 0.0)
print('accuracy', (tp + tn) / len(df_test))
print('recall', tp / (tp + fn) if (tp + fn) else 0.0)

In [None]:
# Recompute the F-score by hand as the harmonic mean of precision and recall
manual_fscore = 2 * ((precision * recall) / (precision + recall))
print('f-score', manual_fscore)