In [609]:
import pandas as pd
from pandasql import sqldf
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import cross_val_score
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [610]:
pd.set_option('display.max_colwidth', 70)
pd.set_option('display.max_rows', None)

In [611]:
pysqldf = lambda q: sqldf(q, globals())

In [612]:
#nltk.download('stopwords')
#nltk.download('wordnet')

In [613]:
# setting a couple of workbook variables
text_source = "transcript"
tag = "lawsuit"

In [614]:
stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [615]:
df_human = pd.read_csv("data/classifications.csv")
df_ml = pd.read_csv("data/cleaned_data.csv")

In [616]:
df_ml.head()

Unnamed: 0,ID,title,runtime,transcript,stemmed
0,ffxh0257,"Deposition of Jeff Kilper, Senior Director for Finance for Special...",1:07:17,record took lunch break mrs dilber talking exhibit number page num...,record took lunch break dilber talk exhibit number page number cou...
1,fghb0039,Camel Cigarettes informercial,00:20:51,getting pretty late hit trail snoop around living like ain job kne...,get pretti late hit trail snoop around live like ain job knew kind...
2,fgxh0257,"Deposition of Mark Trudeau, President and CEO",0:52:40,back record questions witness concludes trudeau composition thank ...,back record question wit conclud trudeau composit thank sir thank ...
3,fhfk0146,CNN Crossfire debate on smoking in public,0:26:20,tonight washington crossfire saturday smoking skies ifs ands butts...,tonight washington crossfir saturday smoke sky and butt left part ...
4,fhgb0191,Think. Don't Smoke. ad campaign compilation,0:34:02,hey guys may ask couple questions ever tried cigarettes know reall...,hey guy may ask coupl question ever tri cigarett know realli much ...


In [617]:
df_human.head()

Unnamed: 0,ID,description,subject,title,runtime
0,ffhb0039,Dan Chenowetch is interviewed about why he smokes and where he has...,tobacco; cigarette; secondhand smoke; addiction; bans; flavors; br...,Interview with smoker Dan Chenowhich,0:20:41
1,ffxh0257,Deposition of Jeff Kilper in RE: National Prescription Opiate Liti...,lawsuit; legal activity; litigation; opioids; sales; marketing;,"Deposition of Jeff Kilper, Senior Director for Finance for Special...",1:07:17
2,fghb0039,Camel Cigarettes infomercial on why their cigarettes are superior ...,tobacco; cigarette; marketing;,Camel Cigarettes informercial,00:20:51
3,fgxh0257,Deposition of Mark Trudeau in Re: State of Rhode Island v Purdue P...,lawsuit; legal activity; litigation; opioids; sales; marketing;,"Deposition of Mark Trudeau, President and CEO",0:52:40
4,fhfk0146,CNN debate on the ban of smoking on Airplanes. Ban is put in place...,tobacco; cigarette;secondhand smoke; closed environment;,CNN Crossfire debate on smoking in public,0:26:20


In [618]:
# Todo? do we want to denormalize this, or get a list of distinct subjects

In [619]:
df_dataset = pysqldf(f"""
SELECT dh.*, dml.{text_source}
FROM 
    df_human dh
JOIN
    df_ml dml
ON dh.ID = dml.id
    WHERE description is not NULL
    AND description != 'error code 224003'
    AND subject NOT LIKE '%href%'
""")

In [620]:
len(df_dataset)

324

In [621]:
df_dataset.head(2)

Unnamed: 0,ID,description,subject,title,runtime,transcript
0,ffxh0257,Deposition of Jeff Kilper in RE: National Prescription Opiate Liti...,lawsuit; legal activity; litigation; opioids; sales; marketing;,"Deposition of Jeff Kilper, Senior Director for Finance for Special...",1:07:17,record took lunch break mrs dilber talking exhibit number page num...
1,fghb0039,Camel Cigarettes infomercial on why their cigarettes are superior ...,tobacco; cigarette; marketing;,Camel Cigarettes informercial,00:20:51,getting pretty late hit trail snoop around living like ain job kne...


In [622]:
#df_dataset.set_index("ID", inplace=True)

In [623]:
# what is this doing and can i use python instead? i can't figure out how to
# selecting ID, subject, transcript
# adding category column when subject includes wanted tag, then 1, else 0

# df = pysqldf(f"""
# SELECT 
#     ID,
#     subject,  
#     {text_source},
#     CASE
#         WHEN subject LIKE '%lawsuit%' THEN 1
#         ELSE 0
#     END AS category
# FROM df_dataset
# --WHERE subject LIKE '%lawsuit%'
# """)

df = df_dataset[['ID', 'subject', 'transcript']]
category = []
for row in df['subject']:
    if tag in row:
        category.append(1)
    else:
        category.append(0)
df['category'] = category
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category'] = category


Unnamed: 0,ID,subject,transcript,category
0,ffxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,record took lunch break mrs dilber talking exhibit number page num...,1
1,fghb0039,tobacco; cigarette; marketing;,getting pretty late hit trail snoop around living like ain job kne...,0
2,fgxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,back record questions witness concludes trudeau composition thank ...,1
3,fhfk0146,tobacco; cigarette;secondhand smoke; closed environment;,tonight washington crossfire saturday smoking skies ifs ands butts...,0
4,fhgb0191,tobacco; cigarette; youth access; PSA; children; future;,hey guys may ask couple questions ever tried cigarettes know reall...,0
5,fhhb0039,tobacco; cigarette; advertise; appeal;,hello doug sarah mcclurkin think time program talked know one ciga...,0
6,fhhh0100,tobacco; cigarette; break; relief;,ever heard man literally walk wall human fly well looking hundred ...,0
7,fhhj0223,tobacco; cigarette; youth; regulation;,top story tonight indiana retail associations joining forces keep ...,0
8,fhwm0256,opioids; coupon program; efficacy; sales and marketing;,tell remember detail happened iraq benefits drug used force rememb...,0
9,fjhb0039,tobacco; cigarette;quality; advertising;,welcome winston bull comedy break guys gonna right watching gerald...,0


In [624]:
df.set_index("ID", inplace=True)

In [625]:
df.head()

Unnamed: 0_level_0,subject,transcript,category
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ffxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,record took lunch break mrs dilber talking exhibit number page num...,1
fghb0039,tobacco; cigarette; marketing;,getting pretty late hit trail snoop around living like ain job kne...,0
fgxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,back record questions witness concludes trudeau composition thank ...,1
fhfk0146,tobacco; cigarette;secondhand smoke; closed environment;,tonight washington crossfire saturday smoking skies ifs ands butts...,0
fhgb0191,tobacco; cigarette; youth access; PSA; children; future;,hey guys may ask couple questions ever tried cigarettes know reall...,0


In [626]:
# number of videos that have chosen tag
(df.category == 1).sum()

26

In [627]:
df_train, df_test = train_test_split(df, test_size=0.5)

In [628]:
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words=stop)),
    ('clf', RandomForestClassifier())
    ])

In [629]:
X_train = text_clf['vect'].fit_transform(df_train[text_source])

In [630]:
text_clf['vect'].get_feature_names_out()

array(['aaron', 'abandon', 'abandoned', ..., 'zones', 'zoom', 'zooming'],
      dtype=object)

In [631]:
#X_train.todense()

In [632]:
df_vector = pd.DataFrame(X_train.todense())
df_vector.columns = text_clf['vect'].get_feature_names_out()

In [633]:
pd.set_option('display.max_rows', None)
#df_vector.T

In [634]:
#df_train.iloc[0][text_source]

In [635]:
X_train = text_clf.fit(df_train[text_source], df_train['category'])

In [636]:
feature_df = pd.DataFrame({"feature": text_clf['vect'].get_feature_names_out(), 
                           'importance': text_clf['clf'].feature_importances_})

In [637]:
pysqldf("""
SELECT 
    *
FROM
    feature_df
ORDER BY
    importance DESC
""").head(10)

Unnamed: 0,feature,importance
0,court,0.01742
1,mallinckrodt,0.015359
2,opioids,0.014592
3,exhibit,0.013273
4,record,0.012141
5,witness,0.010571
6,document,0.009929
7,form,0.009564
8,referring,0.009561
9,discussed,0.00939


In [638]:
feature_df.sort_values(by=['importance'], ascending=False)

Unnamed: 0,feature,importance
2567,court,0.01742029
6568,mallinckrodt,0.01535867
7516,opioids,0.01459191
3930,exhibit,0.01327265
8898,record,0.01214143
12131,witness,0.01057135
3310,document,0.009929243
4401,form,0.009564366
8955,referring,0.009560582
3198,discussed,0.009390057


In [639]:
cross_val = cross_val_score(text_clf, df_test[text_source], df_test['category'], cv=4)

In [640]:
cross_val

array([0.95121951, 0.97560976, 0.95      , 0.975     ])

In [641]:
y_proba = text_clf.predict_proba(df_test[text_source])
y_pred = text_clf.predict(df_test[text_source])

In [642]:
y_proba

array([[1.  , 0.  ],
       [0.94, 0.06],
       [0.94, 0.06],
       [0.8 , 0.2 ],
       [0.97, 0.03],
       [0.27, 0.73],
       [0.99, 0.01],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.95, 0.05],
       [0.98, 0.02],
       [0.97, 0.03],
       [0.96, 0.04],
       [0.95, 0.05],
       [1.  , 0.  ],
       [0.8 , 0.2 ],
       [0.62, 0.38],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.99, 0.01],
       [0.93, 0.07],
       [0.92, 0.08],
       [0.96, 0.04],
       [0.8 , 0.2 ],
       [1.  , 0.  ],
       [0.96, 0.04],
       [0.99, 0.01],
       [0.99, 0.01],
       [0.98, 0.02],
       [0.82, 0.18],
       [1.  , 0.  ],
       [0.97, 0.03],
       [1.  , 0.  ],
       [0.96, 0.04],
       [0.97, 0.03],
       [0.94, 0.06],
       [0.94, 0.06],
       [0.99, 0.01],
       [0.97, 0.03],
       [0.83, 0.17],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.91, 0.09],
       [0.91, 0.09],
       [0.81, 0.19],
       [1.  , 0.  ],
       [1.  ,

In [643]:
text_clf.classes_

array([0, 1])

In [644]:
prob_no_match = [p[0] for p in y_proba]
prob_match = [p[1] for p in y_proba]

In [645]:
df_test['prob_no_match'] = prob_no_match
df_test['prob_match'] = prob_match
df_test['y_pred'] = y_pred

In [646]:
# rename y_match?
y_adj = []

for p in prob_match:
    if p <= .5:
        y_adj.append("Not")
    else:
        y_adj.append("Match")
    
df_test['y_adj'] = y_adj

In [647]:
df_test[['category', 'y_pred', 'y_adj', 'prob_no_match', 'prob_match']]

Unnamed: 0_level_0,category,y_pred,y_adj,prob_no_match,prob_match
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
zhgb0191,0,0,Not,1.0,0.0
zxxb0079,0,0,Not,0.94,0.06
zgfk0146,0,0,Not,0.94,0.06
frgp0190,0,0,Not,0.8,0.2
zjhv0183,0,0,Not,0.97,0.03
zzhh0257,1,1,Match,0.27,0.73
zjlw0036,0,0,Not,0.99,0.01
jpgl0191,0,0,Not,1.0,0.0
xrfp0190,0,0,Not,1.0,0.0
gxgb0191,0,0,Not,1.0,0.0


In [648]:
pysqldf("""SELECT * FROM df_test WHERE `y_pred` != category""")

Unnamed: 0,ID,subject,transcript,category,prob_no_match,prob_match,y_pred,y_adj
0,jxvv0231,tobacco; cigarette; hearing; advertising; nicotine; e-cigarette; v...,testing testing test test test thank you testing testing testing t...,0,0.43,0.57,1,Match
1,zygl0191,tobacco; cigarette; legal activity; lawsuit; testimony,going show what marked plaintiffs exhibit simply package pall mall...,1,0.97,0.03,0,Not
2,zthh0257,lawsuit; legal activity; litigation; opioids; sales; marketing,back record going mark next exhibit car daddy exhibit exhibit reas...,1,0.57,0.43,0,Not
3,xxmm0006,car manufacturer; lawsuit; product recall; defect,welcome larry king live tonight complains makers bad products sett...,1,0.78,0.22,0,Not


In [649]:
pysqldf(f"""
SELECT df_test.*, df.{text_source}, df.category
FROM df_test
JOIN df
ON df_test.ID = df.ID
""")

Unnamed: 0,ID,subject,transcript,category,prob_no_match,prob_match,y_pred,y_adj,transcript.1,category.1
0,zhgb0191,tobacco; cigarette; commerical; advertisement; targeted audience,fort griffin quarter million longhorns pass every year trail north...,0,1.0,0.0,0,Not,fort griffin quarter million longhorns pass every year trail north...,0
1,zhgb0191,tobacco; cigarette; commerical; advertisement; targeted audience,fort griffin quarter million longhorns pass every year trail north...,0,1.0,0.0,0,Not,fort griffin quarter million longhorns pass every year trail north...,0
2,zhgb0191,tobacco; cigarette; commerical; advertisement; targeted audience,fort griffin quarter million longhorns pass every year trail north...,0,1.0,0.0,0,Not,fort griffin quarter million longhorns pass every year trail north...,0
3,zhgb0191,tobacco; cigarette; commerical; advertisement; targeted audience,fort griffin quarter million longhorns pass every year trail north...,0,1.0,0.0,0,Not,fort griffin quarter million longhorns pass every year trail north...,0
4,zxxb0079,tobacco; cigarette; company logos; children; research; study,simpler task products logos include measure exactly like national ...,0,0.94,0.06,0,Not,simpler task products logos include measure exactly like national ...,0
5,zgfk0146,tobacco; cigarette; news; debate,question question guess protect unborn children know wait line sto...,0,0.94,0.06,0,Not,question question guess protect unborn children know wait line sto...,0
6,frgp0190,tobacco; cigarette; advertise; health;,coleman serious charges raised floor senate even tobacco publicati...,0,0.8,0.2,0,Not,coleman serious charges raised floor senate even tobacco publicati...,0
7,zjhv0183,tobacco; cigarette,big may ask couple questions think tried people always think liste...,0,0.97,0.03,0,Not,big may ask couple questions think tried people always think liste...,0
8,zzhh0257,lawsuit; legal activity; litigation; opioids; sales; marketing,funeral previously think asked testified went pharmacies talk phar...,1,0.27,0.73,1,Match,funeral previously think asked testified went pharmacies talk phar...,1
9,zjlw0036,tobacco; cigarette; advertisement; marketing; social; promotion,united states undergoing demographic transition becoming multicult...,0,0.99,0.01,0,Not,united states undergoing demographic transition becoming multicult...,0


In [650]:
y_pred = text_clf.predict(df_test[text_source])

In [651]:
# everything below this line will vary based on the run

In [652]:
precision, recall, fscore, train_support = precision_recall_fscore_support(df_test['category'], y_pred, pos_label=1, average='binary')
print('Precision: {} / Recall: {} / Accuracy: {} / fscore: {}'.format(
    round(precision, 3), round(recall, 3), (round((y_pred==df_test['category']).sum()/len(y_adj), 3)), round(fscore, 3)))

Precision: 0.833 / Recall: 0.625 / Accuracy: 0.975 / fscore: 0.714


In [653]:
print('f-score', 2 * ((precision * recall) / (precision + recall)))

f-score 0.7142857142857143
