In [1]:
import pandas as pd
from pandasql import sqldf
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import cross_val_score
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
pd.set_option('display.max_colwidth', 70)
pd.set_option('display.max_rows', None)

In [3]:
pysqldf = lambda q: sqldf(q, globals())

In [4]:
#nltk.download('stopwords')
#nltk.download('wordnet')

In [5]:
# setting a couple of workbook variables
text_source = "transcript"
tag = "lawsuit"

In [6]:
stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [7]:
df_human = pd.read_csv("data/classifications.csv")
df_ml = pd.read_csv("data/transcript_accuracy.csv")

In [8]:
df_ml.head()

Unnamed: 0,ID,transcript,avg(confidence),min(confidence),max(confidence)
0,ffbn0006,and we're back now at 7:30 on a Tuesday Morning Joe garagiola I'm ...,0.806872,0.730015,0.907541
1,ffcn0006,the results of a new study on public smoking released by The Tobac...,0.813911,0.772248,0.871406
2,ffcw0111,at the Virginia Slims championships in Washington Andrea Yeager de...,0.819386,0.622414,0.912839
3,ffdn0006,this is World News Tonight with Peter Jennings sitting in tonight ...,0.853609,0.776043,0.912838
4,fffc0072,thank you very much for coming here today my name is Victor hanim ...,0.839049,0.677873,0.912839


In [9]:
df_human.head(1)

Unnamed: 0,ID,description,subject,title,runtime
0,ffhb0039,Dan Chenowetch is interviewed about why he smokes and where he has...,tobacco; cigarette; secondhand smoke; addiction; bans; flavors; br...,Interview with smoker Dan Chenowhich,0:20:41


In [10]:
# Todo? do we want to denormalize this, or get a list of distinct subjects

In [11]:
df_dataset = pysqldf(f"""
SELECT dh.*, dml.{text_source}
FROM 
    df_human dh
JOIN
    df_ml dml
ON dh.ID = dml.id
    WHERE description is not NULL
    AND description != 'error code 224003'
""")

In [12]:
len(df_dataset)

341

In [13]:
df_dataset.head(2)

Unnamed: 0,ID,description,subject,title,runtime,transcript
0,ffxh0257,Deposition of Jeff Kilper in RE: National Prescription Opiate Liti...,lawsuit; legal activity; litigation; opioids; sales; marketing;,"Deposition of Jeff Kilper, Senior Director for Finance for Special...",1:07:17,"on the record at 12:49 p.m., okay before we took a lunch break mrs..."
1,fghb0039,Camel Cigarettes infomercial on why their cigarettes are superior ...,tobacco; cigarette; marketing;,Camel Cigarettes informercial,00:20:51,it's getting pretty late when I hit the trail when you Snoop aroun...


In [None]:
#df_dataset.set_index("ID", inplace=True)

In [15]:
df = pysqldf(f"""
SELECT 
    ID,
    subject,  
    {text_source},
    CASE
        WHEN subject LIKE '%lawsuit%' THEN 1
        ELSE 0
    END AS category
FROM df_dataset
--WHERE subject LIKE '%lawsuit%'
""")

In [16]:
df.set_index("ID", inplace=True)

In [17]:
df.head(1)

Unnamed: 0_level_0,subject,transcript,category
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ffxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,"on the record at 12:49 p.m., okay before we took a lunch break mrs...",1


In [18]:
df_train, df_test = train_test_split(df, test_size=0.5)

In [19]:
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words=stop)),
    ('clf', RandomForestClassifier())
    ])

In [20]:
X_train = text_clf['vect'].fit_transform(df_train[text_source])

In [21]:
text_clf['vect'].get_feature_names_out()

array(['00', '000', '001', ..., 'zoo', 'zoom', 'zucker'], dtype=object)

In [23]:
#X_train.todense()

In [24]:
df_vector = pd.DataFrame(X_train.todense())
df_vector.columns = text_clf['vect'].get_feature_names_out()

In [27]:
pd.set_option('display.max_rows', None)
#df_vector.T

In [29]:
df_train.iloc[0][text_source]

"I want to show you what I'm going to marked as exhibit number 17 which is, page numbers 3040 14 am I wrong I did yeah all right I meant 14 thank you very much, glad you know what I mean oh yeah our real time is there a way I can plug it in, now that's not going to work, don't worry about it we can go without it I don't need it, exhibit number 14, which is pain management pocket card set, you ever seen a document like this before I've never seen a document like this before, have you ever provided a document like this to a hcp not that I can recall if you look at the first page of the document where it says ask assess treat and monitor, see that yes under ask it says always ask patient about the presence of pain and accept the patient's report of pain you recall that, I don't recall any of this would that be the same thing for under if you look under monitor where it says most opioid antagonists have no analgesic ceiling dose don't recall that either you know Jesus that is a statement I

In [31]:
X_train = text_clf.fit(df_train[text_source], df_train['category'])

In [32]:
feature_df = pd.DataFrame({"feature": text_clf['vect'].get_feature_names_out(), 
                           'importance': text_clf['clf'].feature_importances_})

In [34]:
pysqldf("""
SELECT 
    *
FROM
    feature_df
ORDER BY
    importance DESC
""").head(10)

Unnamed: 0,feature,importance
0,mallinckrodt,0.02987
1,email,0.011919
2,opioid,0.011712
3,wholesaler,0.011492
4,alliance,0.011362
5,recall,0.010657
6,document,0.00869
7,pharmacies,0.008537
8,junction,0.008151
9,opioids,0.007849


In [35]:
feature_df.sort_values(by=['importance'], ascending=False)

Unnamed: 0,feature,importance
7990,mallinckrodt,0.02987033
4572,email,0.01191942
9112,opioid,0.01171181
14167,wholesaler,0.0114917
811,alliance,0.01136165
10610,recall,0.01065745
4213,document,0.008689635
9609,pharmacies,0.008536857
7284,junction,0.008150516
9113,opioids,0.007849331


In [37]:
cross_val = cross_val_score(text_clf, df_test[text_source], df_test['category'], cv=4)

In [38]:
cross_val

array([0.93023256, 0.95348837, 0.97674419, 0.92857143])

In [42]:
y_proba = text_clf.predict_proba(df_test[text_source])
y_pred = text_clf.predict(df_test[text_source])

In [43]:
y_proba

array([[0.93, 0.07],
       [0.93, 0.07],
       [1.  , 0.  ],
       [0.91, 0.09],
       [0.94, 0.06],
       [0.52, 0.48],
       [0.93, 0.07],
       [0.93, 0.07],
       [0.98, 0.02],
       [0.96, 0.04],
       [0.99, 0.01],
       [0.87, 0.13],
       [0.89, 0.11],
       [0.97, 0.03],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.98, 0.02],
       [0.97, 0.03],
       [0.99, 0.01],
       [0.39, 0.61],
       [0.97, 0.03],
       [0.98, 0.02],
       [0.97, 0.03],
       [0.97, 0.03],
       [0.77, 0.23],
       [0.97, 0.03],
       [0.73, 0.27],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.99, 0.01],
       [0.78, 0.22],
       [0.85, 0.15],
       [0.96, 0.04],
       [0.99, 0.01],
       [0.98, 0.02],
       [0.96, 0.04],
       [0.94, 0.06],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.99, 0.01],
       [0.99, 0.01],
       [0.98, 0.02],
       [0.93, 0.07],
       [0.88, 0.12],
       [0.99, 0.01],
       [0.98, 0.02],
       [0.93, 0.07],
       [0.96,

In [44]:
text_clf.classes_

array([0, 1])

In [45]:
prob_not = [p[0] for p in y_proba]
prob_legal = [p[1] for p in y_proba]

In [46]:
df_test['prob_not'] = prob_not
df_test['prob_legal'] = prob_legal
df_test['y-pred'] = y_pred

In [47]:
y_adj = []

for p in prob_legal:
    if p <= .5:
        y_adj.append("Not")
    else:
        y_adj.append("Lawsuit")
    
df_test['y_adj'] = y_adj

In [48]:
df_test[['category', 'y-pred', 'y_adj', 'prob_not', 'prob_legal']]

Unnamed: 0_level_0,category,y-pred,y_adj,prob_not,prob_legal
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tjnv0001,0,0,Not,0.93,0.07
yydn0006,0,0,Not,0.93,0.07
jphb0039,0,0,Not,1.0,0.0
xkhb0039,0,0,Not,0.91,0.09
hpgl0191,0,0,Not,0.94,0.06
ffxh0257,1,0,Not,0.52,0.48
jmmc0083,0,0,Not,0.93,0.07
xjxb0079,0,0,Not,0.93,0.07
yxxb0079,0,0,Not,0.98,0.02
zmdh0182,0,0,Not,0.96,0.04


In [50]:
pysqldf(f"""
SELECT df_test.*, df.{text_source}, df.category
FROM df_test
JOIN df
ON df_test.ID = df.ID
""")

Unnamed: 0,ID,subject,transcript,category,prob_not,prob_legal,y-pred,y_adj,transcript.1,category.1
0,tjnv0001,tobacco; cigarette; promotion; strategy,"hello Doug this is Sarah McClurkin, I think it's time to do the pr...",0,0.93,0.07,0,Not,"hello Doug this is Sarah McClurkin, I think it's time to do the pr...",0
1,yydn0006,tobacco; cigarette,okay hi I've always admitted to having one of the greatest noses o...,0,0.93,0.07,0,Not,okay hi I've always admitted to having one of the greatest noses o...,0
2,jphb0039,tobacco; cigarette; relax;,"so if we start right now, Wings bunny boy, going to join if we sta...",0,1.0,0.0,0,Not,"so if we start right now, Wings bunny boy, going to join if we sta...",0
3,xkhb0039,tobacco; cigarette; advertising; new product,ladies and gentlemen this video is about a new rgr brand entry cal...,0,0.91,0.09,0,Not,ladies and gentlemen this video is about a new rgr brand entry cal...,0
4,hpgl0191,tobacco; cigarette;<a href='http://www.archive.org/browse.php?fiel...,good morning in an hour the space shuttle Challenger and its crew ...,0,0.94,0.06,0,Not,good morning in an hour the space shuttle Challenger and its crew ...,0
5,ffxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,"on the record at 12:49 p.m., okay before we took a lunch break mrs...",1,0.52,0.48,0,Not,"on the record at 12:49 p.m., okay before we took a lunch break mrs...",1
6,jmmc0083,tobacco; cigarette;profit; sales; retail outlet; business outlet; ...,we do almost 200 million dollars a year in cigarette sales are con...,0,0.93,0.07,0,Not,we do almost 200 million dollars a year in cigarette sales are con...,0
7,xjxb0079,tobacco; cigarette; interview; celebrity sponsor; anti-smoking; ca...,it'll be about a minute before they actually go to you but their p...,0,0.93,0.07,0,Not,it'll be about a minute before they actually go to you but their p...,0
8,yxxb0079,tobacco; cigarette,new smokeless cigarettes have already been the focus of criticism ...,0,0.98,0.02,0,Not,new smokeless cigarettes have already been the focus of criticism ...,0
9,zmdh0182,tobacco; cigarette;,"appreciate it, what I mean by that is that we can be sometimes I g...",0,0.96,0.04,0,Not,"appreciate it, what I mean by that is that we can be sometimes I g...",0


In [52]:
y_pred = text_clf.predict(df_test[text_source])

In [None]:
# everything below this line will vary based on the run

In [53]:
precision, recall, fscore, train_support = precision_recall_fscore_support(df_test['category'], y_pred, pos_label=1, average='binary')
print('Precision: {} / Recall: {} / Accuracy: {} / fscore: {}'.format(
    round(precision, 3), round(recall, 3), (round((y_pred==df_test['category']).sum()/len(y_adj), 3)), round(fscore, 3)))

Precision: 1.0 / Recall: 0.385 / Accuracy: 0.953 / fscore: 0.556


In [54]:
print('f-score', 2 * ((precision * recall) / (precision + recall)))

f-score 0.5555555555555556
