In [47]:
import pandas as pd
from pandasql import sqldf
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import cross_val_score
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [48]:
pd.set_option('display.max_colwidth', 70)
pd.set_option('display.max_rows', None)

In [49]:
pysqldf = lambda q: sqldf(q, globals())

In [50]:
#nltk.download('stopwords')
#nltk.download('wordnet')

In [51]:
# setting a couple of workbook variables
text_source = "transcript"
tag = "advertising"

In [52]:
stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [53]:
df_human = pd.read_csv("data/classifications.csv")
df_ml = pd.read_csv("data/cleaned_data.csv")

In [54]:
df_ml.head()

Unnamed: 0,ID,title,runtime,transcript,stemmed
0,ffxh0257,"Deposition of Jeff Kilper, Senior Director for Finance for Special...",1:07:17,record took lunch break mrs dilber talking exhibit number page num...,record took lunch break dilber talk exhibit number page number cou...
1,fghb0039,Camel Cigarettes informercial,00:20:51,getting pretty late hit trail snoop around living like ain job kne...,get pretti late hit trail snoop around live like ain job knew kind...
2,fgxh0257,"Deposition of Mark Trudeau, President and CEO",0:52:40,back record questions witness concludes trudeau composition thank ...,back record question wit conclud trudeau composit thank sir thank ...
3,fhfk0146,CNN Crossfire debate on smoking in public,0:26:20,tonight washington crossfire saturday smoking skies ifs ands butts...,tonight washington crossfir saturday smoke sky and butt left part ...
4,fhgb0191,Think. Don't Smoke. ad campaign compilation,0:34:02,hey guys may ask couple questions ever tried cigarettes know reall...,hey guy may ask coupl question ever tri cigarett know realli much ...


In [55]:
df_human.head()

Unnamed: 0,ID,description,subject,title,runtime
0,ffhb0039,Dan Chenowetch is interviewed about why he smokes and where he has...,tobacco; cigarette; secondhand smoke; addiction; bans; flavors; br...,Interview with smoker Dan Chenowhich,0:20:41
1,ffxh0257,Deposition of Jeff Kilper in RE: National Prescription Opiate Liti...,lawsuit; legal activity; litigation; opioids; sales; marketing;,"Deposition of Jeff Kilper, Senior Director for Finance for Special...",1:07:17
2,fghb0039,Camel Cigarettes infomercial on why their cigarettes are superior ...,tobacco; cigarette; marketing;,Camel Cigarettes informercial,00:20:51
3,fgxh0257,Deposition of Mark Trudeau in Re: State of Rhode Island v Purdue P...,lawsuit; legal activity; litigation; opioids; sales; marketing;,"Deposition of Mark Trudeau, President and CEO",0:52:40
4,fhfk0146,CNN debate on the ban of smoking on Airplanes. Ban is put in place...,tobacco; cigarette;secondhand smoke; closed environment;,CNN Crossfire debate on smoking in public,0:26:20


In [56]:
# Todo? do we want to denormalize this, or get a list of distinct subjects

In [57]:
df_dataset = pysqldf(f"""
SELECT dh.*, dml.{text_source}
FROM 
    df_human dh
JOIN
    df_ml dml
ON dh.ID = dml.id
    WHERE description is not NULL
    AND description != 'error code 224003'
    AND subject NOT LIKE '%href%'
""")

In [58]:
len(df_dataset)

324

In [59]:
df_dataset.head(2)

Unnamed: 0,ID,description,subject,title,runtime,transcript
0,ffxh0257,Deposition of Jeff Kilper in RE: National Prescription Opiate Liti...,lawsuit; legal activity; litigation; opioids; sales; marketing;,"Deposition of Jeff Kilper, Senior Director for Finance for Special...",1:07:17,record took lunch break mrs dilber talking exhibit number page num...
1,fghb0039,Camel Cigarettes infomercial on why their cigarettes are superior ...,tobacco; cigarette; marketing;,Camel Cigarettes informercial,00:20:51,getting pretty late hit trail snoop around living like ain job kne...


In [60]:
#df_dataset.set_index("ID", inplace=True)

In [61]:
# what is this doing and can i use python instead? i can't figure out how to
# selecting ID, subject, transcript
# adding category column when subject includes wanted tag, then 1, else 0

# df = pysqldf(f"""
# SELECT 
#     ID,
#     subject,  
#     {text_source},
#     CASE
#         WHEN subject LIKE '%lawsuit%' THEN 1
#         ELSE 0
#     END AS category
# FROM df_dataset
# --WHERE subject LIKE '%lawsuit%'
# """)

df = df_dataset[['ID', 'subject', 'transcript']]
category = []
for row in df['subject']:
    if tag in row:
        category.append(1)
    else:
        category.append(0)
df['category'] = category
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category'] = category


Unnamed: 0,ID,subject,transcript,category
0,ffxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,record took lunch break mrs dilber talking exhibit number page num...,0
1,fghb0039,tobacco; cigarette; marketing;,getting pretty late hit trail snoop around living like ain job kne...,0
2,fgxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,back record questions witness concludes trudeau composition thank ...,0
3,fhfk0146,tobacco; cigarette;secondhand smoke; closed environment;,tonight washington crossfire saturday smoking skies ifs ands butts...,0
4,fhgb0191,tobacco; cigarette; youth access; PSA; children; future;,hey guys may ask couple questions ever tried cigarettes know reall...,0
5,fhhb0039,tobacco; cigarette; advertise; appeal;,hello doug sarah mcclurkin think time program talked know one ciga...,0
6,fhhh0100,tobacco; cigarette; break; relief;,ever heard man literally walk wall human fly well looking hundred ...,0
7,fhhj0223,tobacco; cigarette; youth; regulation;,top story tonight indiana retail associations joining forces keep ...,0
8,fhwm0256,opioids; coupon program; efficacy; sales and marketing;,tell remember detail happened iraq benefits drug used force rememb...,0
9,fjhb0039,tobacco; cigarette;quality; advertising;,welcome winston bull comedy break guys gonna right watching gerald...,1


In [62]:
df.set_index("ID", inplace=True)

In [63]:
df.head()

Unnamed: 0_level_0,subject,transcript,category
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ffxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,record took lunch break mrs dilber talking exhibit number page num...,0
fghb0039,tobacco; cigarette; marketing;,getting pretty late hit trail snoop around living like ain job kne...,0
fgxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,back record questions witness concludes trudeau composition thank ...,0
fhfk0146,tobacco; cigarette;secondhand smoke; closed environment;,tonight washington crossfire saturday smoking skies ifs ands butts...,0
fhgb0191,tobacco; cigarette; youth access; PSA; children; future;,hey guys may ask couple questions ever tried cigarettes know reall...,0


In [64]:
# number of videos that have chosen tag
(df.category == 1).sum()

23

In [65]:
df_train, df_test = train_test_split(df, test_size=0.5)

In [66]:
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words=stop)),
    ('clf', RandomForestClassifier())
    ])

In [67]:
X_train = text_clf['vect'].fit_transform(df_train[text_source])

In [68]:
text_clf['vect'].get_feature_names_out()

array(['aaron', 'abandon', 'abandoned', ..., 'zone', 'zooming', 'zucker'],
      dtype=object)

In [69]:
#X_train.todense()

In [70]:
df_vector = pd.DataFrame(X_train.todense())
df_vector.columns = text_clf['vect'].get_feature_names_out()

In [71]:
pd.set_option('display.max_rows', None)
#df_vector.T

In [72]:
#df_train.iloc[0][text_source]

In [73]:
X_train = text_clf.fit(df_train[text_source], df_train['category'])

In [74]:
feature_df = pd.DataFrame({"feature": text_clf['vect'].get_feature_names_out(), 
                           'importance': text_clf['clf'].feature_importances_})

In [75]:
pysqldf("""
SELECT 
    *
FROM
    feature_df
ORDER BY
    importance DESC
""").head(10)

Unnamed: 0,feature,importance
0,forget,0.015912
1,right,0.011946
2,merry,0.010774
3,monday,0.010456
4,guest,0.010166
5,dumb,0.00993
6,usa,0.009216
7,time,0.008465
8,abroad,0.007657
9,gift,0.007287


In [76]:
feature_df.sort_values(by=['importance'], ascending=False)

Unnamed: 0,feature,importance
4715,forget,0.01591233
9847,right,0.01194566
7246,merry,0.01077401
7445,monday,0.01045562
5226,guest,0.01016578
3680,dumb,0.009930017
12289,usa,0.009216353
11765,time,0.008464525
24,abroad,0.007656947
4999,gift,0.007287062


In [77]:
cross_val = cross_val_score(text_clf, df_test[text_source], df_test['category'], cv=4)

In [78]:
cross_val

array([0.92682927, 0.90243902, 0.85      , 0.925     ])

In [79]:
y_proba = text_clf.predict_proba(df_test[text_source])
y_pred = text_clf.predict(df_test[text_source])

In [80]:
y_proba

array([[0.9 , 0.1 ],
       [0.93, 0.07],
       [0.95, 0.05],
       [1.  , 0.  ],
       [0.91, 0.09],
       [0.92, 0.08],
       [0.92, 0.08],
       [0.91, 0.09],
       [0.97, 0.03],
       [0.88, 0.12],
       [0.94, 0.06],
       [0.94, 0.06],
       [0.93, 0.07],
       [0.94, 0.06],
       [0.94, 0.06],
       [0.92, 0.08],
       [0.99, 0.01],
       [0.96, 0.04],
       [0.98, 0.02],
       [0.94, 0.06],
       [0.93, 0.07],
       [0.98, 0.02],
       [0.94, 0.06],
       [0.76, 0.24],
       [0.99, 0.01],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.92, 0.08],
       [0.96, 0.04],
       [0.94, 0.06],
       [0.88, 0.12],
       [0.97, 0.03],
       [0.91, 0.09],
       [1.  , 0.  ],
       [0.96, 0.04],
       [0.96, 0.04],
       [0.97, 0.03],
       [0.85, 0.15],
       [1.  , 0.  ],
       [0.91, 0.09],
       [0.88, 0.12],
       [0.96, 0.04],
       [0.94, 0.06],
       [0.97, 0.03],
       [0.94, 0.06],
       [0.94, 0.06],
       [0.96, 0.04],
       [0.98,

In [81]:
text_clf.classes_

array([0, 1])

In [82]:
prob_no_match = [p[0] for p in y_proba]
prob_match = [p[1] for p in y_proba]

In [83]:
df_test['prob_no_match'] = prob_no_match
df_test['prob_match'] = prob_match
df_test['y_pred'] = y_pred

In [84]:
# rename y_match?
y_adj = []

for p in prob_match:
    if p <= .5:
        y_adj.append("Not")
    else:
        y_adj.append("Match")
    
df_test['y_adj'] = y_adj

In [85]:
df_test[['category', 'y_pred', 'y_adj', 'prob_no_match', 'prob_match']]

Unnamed: 0_level_0,category,y_pred,y_adj,prob_no_match,prob_match
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
xzgb0039,0,0,Not,0.9,0.1
jqgl0191,0,0,Not,0.93,0.07
ytgb0039,0,0,Not,0.95,0.05
jtgp0190,0,0,Not,1.0,0.0
yqyg0114,0,0,Not,0.91,0.09
ggxh0257,0,0,Not,0.92,0.08
xsbg0182,0,0,Not,0.92,0.08
yrhb0039,0,0,Not,0.91,0.09
yghb0039,0,0,Not,0.97,0.03
ygvp0038,0,0,Not,0.88,0.12


In [86]:
pysqldf("""SELECT * FROM df_test WHERE `y_pred` != category""")

Unnamed: 0,ID,subject,transcript,category,prob_no_match,prob_match,y_pred,y_adj
0,ynhb0039,tobacco; cigarette; advertising; new product,adorable barbara husband built things crazy putting tell well worl...,1,0.94,0.06,0,Not
1,ztxw0178,tobacco; cigarette; debate; television; youth; women; marketing; a...,western australia new much specific health warnings tested cigaret...,1,0.96,0.04,0,Not
2,ympj0015,tobacco; cigarette; advertising; new product,smoke mostly water glycerin there almost smoke lid wonder less sec...,1,0.94,0.06,0,Not
3,ynpj0015,tobacco; cigarette; advertising; new product,adorable barbara husband bill thinks crazy putting tell hey world ...,1,0.88,0.12,0,Not
4,xxhb0039,tobacco; cigarette; advertising; new product,adorable barbara husband bill thinks crazy putting tell well world...,1,0.96,0.04,0,Not
5,yxgb0191,tobacco; cigarette; prevention; advertising; PSA,matter big son gets matter strong gets matter smart gets keep talk...,1,0.9,0.1,0,Not
6,tmgp0190,tobacco; cigarette; promotion; advertising,well first half good neighbors brought remove three hundred head c...,1,0.97,0.03,0,Not
7,jxvv0231,tobacco; cigarette; hearing; advertising; nicotine; e-cigarette; v...,testing testing test test test thank you testing testing testing t...,1,0.85,0.15,0,Not
8,yyhb0039,tobacco; cigarette; news report; advertising; demographics; sales,know war words waged cigarettes continuing series titled smoking g...,1,0.92,0.08,0,Not
9,zfhb0039,tobacco; cigarette; advertising,getting pretty late hit trail snoop around living like ain job kne...,1,0.91,0.09,0,Not


In [87]:
pysqldf(f"""
SELECT df_test.*, df.{text_source}, df.category
FROM df_test
JOIN df
ON df_test.ID = df.ID
""")

Unnamed: 0,ID,subject,transcript,category,prob_no_match,prob_match,y_pred,y_adj,transcript.1,category.1
0,xzgb0039,tobacco; cigarette; report; conference; study,turned percentage children develop asthma significantly higher wom...,0,0.9,0.1,0,Not,turned percentage children develop asthma significantly higher wom...,0
1,jqgl0191,tobacco; cigarette; tar; flavor;,year history west point federal government released annual report ...,0,0.93,0.07,0,Not,year history west point federal government released annual report ...,0
2,ytgb0039,tobacco; cigarette; research; science,top executives major cigarette companies mounted assault capitol h...,0,0.95,0.05,0,Not,top executives major cigarette companies mounted assault capitol h...,0
3,jtgp0190,tobacco; cigarette;,ingredient cigarette smoke identified injuries human health confid...,0,1.0,0.0,0,Not,ingredient cigarette smoke identified injuries human health confid...,0
4,yqyg0114,tobacco; cigarette; news; companies; politics,secondhand smoke leading environmental cause cancer nbc tom asshol...,0,0.91,0.09,0,Not,secondhand smoke leading environmental cause cancer nbc tom asshol...,0
5,ggxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,back record hey mister vorta stassi say say strasse german way kno...,0,0.92,0.08,0,Not,back record hey mister vorta stassi say say strasse german way kno...,0
6,xsbg0182,tobacco; cigarette; sales; marketing; employee video; training,prepared time psychic hotline mysterious questions future answered...,0,0.92,0.08,0,Not,prepared time psychic hotline mysterious questions future answered...,0
7,yrhb0039,tobacco; cigarette; training; sales,welcome retail conversion sales team objective sales team tell adu...,0,0.91,0.09,0,Not,welcome retail conversion sales team objective sales team tell adu...,0
8,yghb0039,tobacco; cigarette; company video; PR; marketing; prevention,keeping america beautiful means keeping community beautiful lots w...,0,0.97,0.03,0,Not,keeping america beautiful means keeping community beautiful lots w...,0
9,ygvp0038,tobacco; cigarette; advertisement; marketing; social; promotion,biggest thing happening club dancing edge nights sight club benson...,0,0.88,0.12,0,Not,biggest thing happening club dancing edge nights sight club benson...,0


In [88]:
y_pred = text_clf.predict(df_test[text_source])

In [89]:
# everything below this line will vary based on the run

In [90]:
precision, recall, fscore, train_support = precision_recall_fscore_support(df_test['category'], y_pred, pos_label=1, average='binary')
print('Precision: {} / Recall: {} / Accuracy: {} / fscore: {}'.format(
    round(precision, 3), round(recall, 3), (round((y_pred==df_test['category']).sum()/len(y_adj), 3)), round(fscore, 3)))

Precision: 0.0 / Recall: 0.0 / Accuracy: 0.92 / fscore: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [91]:
print('f-score', 2 * ((precision * recall) / (precision + recall)))

f-score nan


  print('f-score', 2 * ((precision * recall) / (precision + recall)))


In [92]:
# note: is 23 matching videos not enough? probably not bc only about 7%
# still null f-score when bringing threshold down to .15 or .30