In [1]:
import pandas as pd
from pandasql import sqldf
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import cross_val_score
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
pd.set_option('display.max_colwidth', 70)
pd.set_option('display.max_rows', None)

In [3]:
pysqldf = lambda q: sqldf(q, globals())

In [4]:
#nltk.download('stopwords')
#nltk.download('wordnet')

In [5]:
# setting a couple of workbook variables
text_source = "transcript"
tag = "tobacco"

In [6]:
stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [7]:
df_human = pd.read_csv("data/classifications.csv")
df_ml = pd.read_csv("data/transcript_accuracy.csv")

In [8]:
df_ml.head()

Unnamed: 0,ID,transcript,avg(confidence),min(confidence),max(confidence)
0,ffbn0006,and we're back now at 7:30 on a Tuesday Morning Joe garagiola I'm ...,0.806872,0.730015,0.907541
1,ffcn0006,the results of a new study on public smoking released by The Tobac...,0.813911,0.772248,0.871406
2,ffcw0111,at the Virginia Slims championships in Washington Andrea Yeager de...,0.819386,0.622414,0.912839
3,ffdn0006,this is World News Tonight with Peter Jennings sitting in tonight ...,0.853609,0.776043,0.912838
4,fffc0072,thank you very much for coming here today my name is Victor hanim ...,0.839049,0.677873,0.912839


In [9]:
df_human.head(1)

Unnamed: 0,ID,description,subject,title,runtime
0,ffhb0039,Dan Chenowetch is interviewed about why he smokes and where he has...,tobacco; cigarette; secondhand smoke; addiction; bans; flavors; br...,Interview with smoker Dan Chenowhich,0:20:41


In [10]:
# Todo? do we want to denormalize this, or get a list of distinct subjects

In [81]:
df_dataset = pysqldf(f"""
SELECT dh.*, dml.{text_source}, dml.`avg(confidence)` as avg_confidence
FROM 
    df_human dh
JOIN
    df_ml dml
ON dh.ID = dml.id
    WHERE description is not NULL
    AND description != 'error code 224003'
""")

In [49]:
len(df_dataset)

341

In [50]:
df_dataset.head(2)

Unnamed: 0,ID,description,subject,title,runtime,transcript,avg(confidence)
0,ffxh0257,Deposition of Jeff Kilper in RE: National Prescription Opiate Liti...,lawsuit; legal activity; litigation; opioids; sales; marketing;,"Deposition of Jeff Kilper, Senior Director for Finance for Special...",1:07:17,"on the record at 12:49 p.m., okay before we took a lunch break mrs...",0.852022
1,fghb0039,Camel Cigarettes infomercial on why their cigarettes are superior ...,tobacco; cigarette; marketing;,Camel Cigarettes informercial,00:20:51,it's getting pretty late when I hit the trail when you Snoop aroun...,0.875157


In [14]:
#df_dataset.set_index("ID", inplace=True)

In [83]:
df = pysqldf(f"""
SELECT 
    ID,
    subject,  
    {text_source},
    avg_confidence,
    CASE
        WHEN subject LIKE '%{tag}%' THEN 1
        ELSE 0
    END AS category
FROM df_dataset
""")

In [84]:
df.set_index("ID", inplace=True)

In [85]:
df.head(1)

Unnamed: 0_level_0,subject,transcript,avg_confidence,category
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ffxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,"on the record at 12:49 p.m., okay before we took a lunch break mrs...",0.852022,0


In [86]:
df_train, df_test = train_test_split(df, test_size=0.5)

In [87]:
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words=stop)),
    ('clf', RandomForestClassifier())
    ])

In [88]:
X_train = text_clf['vect'].fit_transform(df_train[text_source])

In [89]:
text_clf['vect'].get_feature_names_out()

array(['00', '000', '001', ..., 'zooming', 'zr', 'zucker'], dtype=object)

In [90]:
#X_train.todense()

In [91]:
df_vector = pd.DataFrame(X_train.todense())
df_vector.columns = text_clf['vect'].get_feature_names_out()

In [92]:
pd.set_option('display.max_rows', None)
#df_vector.T

In [93]:
df_train.iloc[0][text_source]

"read your role of the Dodgers is an expert fisherman and takes every opportunity he can the go fishing and part of his equipment is a pack of camels he tried many different brands and has this to say about camels no other cigarette can compare with camels from mildness or flavor smoke camels for 30 days through steady smoking you'll discover how mild and good-tasting camels are you'll see why after all the mildness tests camel is America's most popular cigarette, by billions, after a game Carl furillo of the Dodgers looks for his pretty wife and the mile cigarette a camel of course Carl tried many different brands before he decided on camels as his steady smoke it was the sensible 30-day camel test that really convinced him just how mild a cigarette can be follow Carl's lead smoke camels yourself you'll see why after all the mildness tests camel is America's most popular cigarette by billions,outfielder Hank Bauer of the New York Yankees likes to take life easy when he's not playing b

In [94]:
X_train = text_clf.fit(df_train[text_source], df_train['category'])

In [95]:
feature_df = pd.DataFrame({"feature": text_clf['vect'].get_feature_names_out(), 
                           'importance': text_clf['clf'].feature_importances_})

In [96]:
pysqldf("""
SELECT 
    *
FROM
    feature_df
ORDER BY
    importance DESC
""").head(10)

Unnamed: 0,feature,importance
0,oxy,0.018747
1,opioids,0.018512
2,tennessee,0.016335
3,reports,0.013266
4,aside,0.013135
5,patient,0.012681
6,correctly,0.01122
7,form,0.011149
8,patients,0.011131
9,distributor,0.010663


In [97]:
feature_df.sort_values(by=['importance'], ascending=False)[:20]

Unnamed: 0,feature,importance
8833,oxy,0.018747
8671,opioids,0.018512
12407,tennessee,0.016335
10402,reports,0.013266
1109,aside,0.013135
8990,patient,0.012681
3082,correctly,0.01122
5168,form,0.011149
8991,patients,0.011131
3933,distributor,0.010663


In [98]:
cross_val = cross_val_score(text_clf, df_test[text_source], df_test['category'], cv=4)

In [99]:
cross_val

array([0.90697674, 0.88372093, 0.93023256, 0.88095238])

In [100]:
y_proba = text_clf.predict_proba(df_test[text_source])
y_pred = text_clf.predict(df_test[text_source])

In [101]:
# only list the first few, we'll build a dataframe on this later anyway
y_proba[:10]

array([[0.01, 0.99],
       [0.17, 0.83],
       [0.  , 1.  ],
       [0.01, 0.99],
       [0.06, 0.94],
       [0.02, 0.98],
       [0.07, 0.93],
       [0.04, 0.96],
       [0.04, 0.96],
       [0.03, 0.97]])

In [102]:
text_clf.classes_

array([0, 1])

In [103]:
prob_not = [p[0] for p in y_proba]
prob_legal = [p[1] for p in y_proba]

In [104]:
df_test['prob_no_match'] = prob_not
df_test['prob_match'] = prob_legal
df_test['y-pred'] = y_pred

In [105]:
threshold = 0.5

y_adj = []

for p in prob_legal:
    if p <= threshold:
        y_adj.append(0)
    else:
        y_adj.append(1)
    
df_test['y_adj'] = y_adj

In [106]:
df_test[['category', 'y-pred', 'y_adj', 'prob_no_match', 'prob_match']].head()

Unnamed: 0_level_0,category,y-pred,y_adj,prob_no_match,prob_match
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
fypj0015,1,1,1,0.01,0.99
jhdc0072,1,1,1,0.17,0.83
jrcf0191,1,1,1,0.0,1.0
yrhb0039,1,1,1,0.01,0.99
fpnh0063,1,1,1,0.06,0.94


In [107]:
tag_results = pysqldf(f"""
SELECT df_test.*, df.{text_source}, df.category, df.avg_confidence
FROM df_test
JOIN df
ON df_test.ID = df.ID
""")

In [108]:
tag_results.head()

Unnamed: 0,ID,subject,transcript,avg_confidence,category,prob_no_match,prob_match,y-pred,y_adj,transcript.1,category.1,avg_confidence.1
0,fypj0015,tobacco; cigarette; glycerine; health; filter;,imagine a cigarette with smoke that disappears like this and smell...,0.849739,1,0.01,0.99,1,1,imagine a cigarette with smoke that disappears like this and smell...,1,0.849739
1,jhdc0072,tobacco; cigarette; ad;,there's the baron and we're going to show you a couple of things y...,0.827533,1,0.17,0.83,1,1,there's the baron and we're going to show you a couple of things y...,1,0.827533
2,jrcf0191,tobacco; cigarette; fresh;,as he stepped from his long limousine said Sir Arthur Smith Beverl...,0.912838,1,0.0,1.0,1,1,as he stepped from his long limousine said Sir Arthur Smith Beverl...,1,0.912838
3,yrhb0039,tobacco; cigarette; training; sales,"welcome to the retail conversion sales team, the objective of this...",0.856988,1,0.01,0.99,1,1,"welcome to the retail conversion sales team, the objective of this...",1,0.856988
4,fpnh0063,tobacco; cigarette; health; cost; profit;,ladies and Gentlemen please welcome the director of Market plannin...,0.861348,1,0.06,0.94,1,1,ladies and Gentlemen please welcome the director of Market plannin...,1,0.861348


In [114]:
# let's look into where the model missed it
df_missed = pysqldf("""
SELECT 
    *
FROM
    tag_results
WHERE 
    category != `y_adj`
""")

In [116]:
df_accurate = pysqldf("""
SELECT 
    *
FROM
    tag_results
WHERE 
    category == `y_adj`
""")

In [123]:
df_missed

Unnamed: 0,ID,subject,transcript,avg_confidence,category,prob_no_match,prob_match,y-pred,y_adj
0,xrgp0190,alcohol; commerical; advertisement; promotion,"vinyl horse for this year's hits team is thunder thunder, maybe ne...",0.793201,0,0.0,1.0,1,1
1,xhwm0256,opioids; sales and marketing; physician interview; costs,make a comment about it so the rep saying it she kind of addressed...,0.829267,0,0.07,0.93,1,1
2,fhwm0256,opioids; coupon program; efficacy; sales and marketing;,tell me what you remember from the detail that happened in Iraq ha...,0.764117,0,0.02,0.98,1,1
3,jgxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,you're back on the record at 141 p.m. good afternoon mr. Webb my n...,0.846447,0,0.44,0.56,1,1
4,xxmm0006,car manufacturer; lawsuit; product recall; defect,welcome to Larry King live tonight when complains against the make...,0.830116,0,0.13,0.87,1,1
5,jzhh0257,lawsuit; legal activity; litigation; opioids; sales; marketing,against media number three and the deposition of tithing time on t...,0.842928,0,0.35,0.65,1,1
6,zgwm0256,opioids; physician interview; sales and marketing,"okay just a quick look at the you saw this yesterday as well, at t...",0.851899,0,0.03,0.97,1,1
7,ffxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,"on the record at 12:49 p.m., okay before we took a lunch break mrs...",0.852022,0,0.43,0.57,1,1
8,trfp0190,news; fundraising,for over a quarter of a century despite the technical challenges a...,0.846776,0,0.02,0.98,1,1
9,ghwm0256,opioids; interview; physician; sales and marketing,let's address the cost issue for a couple of moments she did talk ...,0.79117,0,0.02,0.98,1,1


In [121]:
df_accurate.describe()['avg_confidence']

count    157.000000
mean       0.837999
std        0.054845
min        0.358044
25%        0.824442
50%        0.842276
75%        0.860647
max        0.912839
Name: avg_confidence, dtype: float64

In [122]:
df_missed.describe()['avg_confidence']

count    16.000000
mean      0.818379
std       0.034040
min       0.739904
25%       0.793898
50%       0.829692
75%       0.846529
max       0.852022
Name: avg_confidence, dtype: float64

In [110]:
y_pred = text_clf.predict(df_test[text_source])

In [111]:
# everything below this line will vary based on the run

In [112]:
precision, recall, fscore, train_support = precision_recall_fscore_support(df_test['category'], y_adj, pos_label=1, average='binary')
print('Precision: {} / Recall: {} / Accuracy: {} / fscore: {}'.format(
    round(precision, 3), round(recall, 3), (round((y_pred==df_test['category']).sum()/len(y_adj), 3)), round(fscore, 3)))

Precision: 0.904 / Recall: 1.0 / Accuracy: 0.906 / fscore: 0.95


In [113]:
print('f-score', 2 * ((precision * recall) / (precision + recall)))

f-score 0.949685534591195
