In [1]:
import pandas as pd
from pandasql import sqldf
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import cross_val_score
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
pd.set_option('display.max_colwidth', 70)
pd.set_option('display.max_rows', None)

In [3]:
pysqldf = lambda q: sqldf(q, globals())

In [4]:
#nltk.download('stopwords')
#nltk.download('wordnet')

In [5]:
# setting a couple of workbook variables
text_source = "transcript"
tag = "health"

In [6]:
stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [7]:
df_human = pd.read_csv("data/classifications.csv")
df_ml = pd.read_csv("data/cleaned_data.csv")

In [8]:
df_ml.head()

Unnamed: 0,ID,title,runtime,transcript
0,ffxh0257,"Deposition of Jeff Kilper, Senior Director for Finance for Special...",1:07:17,record okay took lunch break mrs dilber talking exhibit number pag...
1,fghb0039,Camel Cigarettes informercial,00:20:51,getting pretty late hit trail snoop around living like ain t job k...
2,fgxh0257,"Deposition of Mark Trudeau, President and CEO",0:52:40,back record questions witness concludes mr trudeau composition tha...
3,fhfk0146,CNN Crossfire debate on smoking in public,0:26:20,tonight washington crossfire saturday smoking skies ifs ands butts...
4,fhgb0191,Think. Don't Smoke. ad campaign compilation,0:34:02,hey guys may ask couple questions ever tried cigarettes know reall...


In [9]:
df_human.head()

Unnamed: 0,ID,description,subject,title,runtime
0,ffhb0039,Dan Chenowetch is interviewed about why he smokes and where he has...,tobacco; cigarette; secondhand smoke; addiction; bans; flavors; br...,Interview with smoker Dan Chenowhich,0:20:41
1,ffxh0257,Deposition of Jeff Kilper in RE: National Prescription Opiate Liti...,lawsuit; legal activity; litigation; opioids; sales; marketing;,"Deposition of Jeff Kilper, Senior Director for Finance for Special...",1:07:17
2,fghb0039,Camel Cigarettes infomercial on why their cigarettes are superior ...,tobacco; cigarette; marketing;,Camel Cigarettes informercial,00:20:51
3,fgxh0257,Deposition of Mark Trudeau in Re: State of Rhode Island v Purdue P...,lawsuit; legal activity; litigation; opioids; sales; marketing;,"Deposition of Mark Trudeau, President and CEO",0:52:40
4,fhfk0146,CNN debate on the ban of smoking on Airplanes. Ban is put in place...,tobacco; cigarette;secondhand smoke; closed environment;,CNN Crossfire debate on smoking in public,0:26:20


In [10]:
# Todo? do we want to denormalize this, or get a list of distinct subjects

In [11]:
df_dataset = pysqldf(f"""
SELECT dh.*, dml.{text_source}
FROM 
    df_human dh
JOIN
    df_ml dml
ON dh.ID = dml.id
    WHERE description is not NULL
    AND description != 'error code 224003'
    AND subject NOT LIKE '%href%'
""")

In [12]:
len(df_dataset)

324

In [13]:
df_dataset.head(2)

Unnamed: 0,ID,description,subject,title,runtime,transcript
0,ffxh0257,Deposition of Jeff Kilper in RE: National Prescription Opiate Liti...,lawsuit; legal activity; litigation; opioids; sales; marketing;,"Deposition of Jeff Kilper, Senior Director for Finance for Special...",1:07:17,record okay took lunch break mrs dilber talking exhibit number pag...
1,fghb0039,Camel Cigarettes infomercial on why their cigarettes are superior ...,tobacco; cigarette; marketing;,Camel Cigarettes informercial,00:20:51,getting pretty late hit trail snoop around living like ain t job k...


In [14]:
#df_dataset.set_index("ID", inplace=True)

In [15]:
# what is this doing and can i use python instead? i can't figure out how to
# selecting ID, subject, transcript
# adding category column when subject includes wanted tag, then 1, else 0

# df = pysqldf(f"""
# SELECT 
#     ID,
#     subject,  
#     {text_source},
#     CASE
#         WHEN subject LIKE '%lawsuit%' THEN 1
#         ELSE 0
#     END AS category
# FROM df_dataset
# --WHERE subject LIKE '%lawsuit%'
# """)

df = df_dataset[['ID', 'subject', 'transcript']]
category = []
for row in df['subject']:
    if tag in row:
        category.append(1)
    else:
        category.append(0)
df['category'] = category
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category'] = category


Unnamed: 0,ID,subject,transcript,category
0,ffxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,record okay took lunch break mrs dilber talking exhibit number pag...,0
1,fghb0039,tobacco; cigarette; marketing;,getting pretty late hit trail snoop around living like ain t job k...,0
2,fgxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,back record questions witness concludes mr trudeau composition tha...,0
3,fhfk0146,tobacco; cigarette;secondhand smoke; closed environment;,tonight washington crossfire saturday smoking skies ifs ands butts...,0
4,fhgb0191,tobacco; cigarette; youth access; PSA; children; future;,hey guys may ask couple questions ever tried cigarettes know reall...,0
5,fhhb0039,tobacco; cigarette; advertise; appeal;,hello doug sarah mcclurkin think time program talked know one ciga...,0
6,fhhh0100,tobacco; cigarette; break; relief;,ever heard man literally walk wall human fly well looking hundred ...,0
7,fhhj0223,tobacco; cigarette; youth; regulation;,top story tonight indiana retail associations joining forces keep ...,0
8,fhwm0256,opioids; coupon program; efficacy; sales and marketing;,tell remember detail happened iraq benefits drug used okay force r...,0
9,fjhb0039,tobacco; cigarette;quality; advertising;,welcome winston bull comedy break guys gonna right watching gerald...,0


In [16]:
df.set_index("ID", inplace=True)

In [17]:
df.head()

Unnamed: 0_level_0,subject,transcript,category
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ffxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,record okay took lunch break mrs dilber talking exhibit number pag...,0
fghb0039,tobacco; cigarette; marketing;,getting pretty late hit trail snoop around living like ain t job k...,0
fgxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,back record questions witness concludes mr trudeau composition tha...,0
fhfk0146,tobacco; cigarette;secondhand smoke; closed environment;,tonight washington crossfire saturday smoking skies ifs ands butts...,0
fhgb0191,tobacco; cigarette; youth access; PSA; children; future;,hey guys may ask couple questions ever tried cigarettes know reall...,0


In [18]:
# number of videos that have chosen tag
(df.category == 1).sum()

30

In [19]:
df_train, df_test = train_test_split(df, test_size=0.5)

In [20]:
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words=stop)),
    ('clf', RandomForestClassifier())
    ])

In [21]:
X_train = text_clf['vect'].fit_transform(df_train[text_source])

In [22]:
text_clf['vect'].get_feature_names_out()

array(['aaron', 'abandon', 'abandoned', ..., 'zombie', 'zone', 'zooming'],
      dtype=object)

In [23]:
#X_train.todense()

In [24]:
df_vector = pd.DataFrame(X_train.todense())
df_vector.columns = text_clf['vect'].get_feature_names_out()

In [25]:
pd.set_option('display.max_rows', None)
#df_vector.T

In [26]:
#df_train.iloc[0][text_source]

In [27]:
X_train = text_clf.fit(df_train[text_source], df_train['category'])

In [28]:
feature_df = pd.DataFrame({"feature": text_clf['vect'].get_feature_names_out(), 
                           'importance': text_clf['clf'].feature_importances_})

In [29]:
pysqldf("""
SELECT 
    *
FROM
    feature_df
ORDER BY
    importance DESC
""").head(10)

Unnamed: 0,feature,importance
0,injuries,0.009546
1,hiring,0.007952
2,cigarette,0.007839
3,ingredient,0.007627
4,smaller,0.007579
5,issued,0.007474
6,smoke,0.006768
7,confident,0.006547
8,question,0.006364
9,healthy,0.00629


In [30]:
feature_df.sort_values(by=['importance'], ascending=False)

Unnamed: 0,feature,importance
5957,injuries,0.009545731
5496,hiring,0.007952481
2005,cigarette,0.007839015
5931,ingredient,0.007626637
10829,smaller,0.007578607
6181,issued,0.007473633
10844,smoke,0.006767513
2414,confident,0.006546852
9366,question,0.006363629
5384,healthy,0.006289574


In [31]:
cross_val = cross_val_score(text_clf, df_test[text_source], df_test['category'], cv=4)

In [32]:
cross_val

array([0.90243902, 0.90243902, 0.9       , 0.9       ])

In [33]:
y_proba = text_clf.predict_proba(df_test[text_source])
y_pred = text_clf.predict(df_test[text_source])

In [34]:
y_proba

array([[1.  , 0.  ],
       [0.86, 0.14],
       [0.88, 0.12],
       [0.96, 0.04],
       [0.85, 0.15],
       [0.97, 0.03],
       [0.91, 0.09],
       [0.98, 0.02],
       [0.96, 0.04],
       [0.86, 0.14],
       [1.  , 0.  ],
       [0.88, 0.12],
       [0.99, 0.01],
       [0.8 , 0.2 ],
       [0.63, 0.37],
       [0.95, 0.05],
       [0.84, 0.16],
       [0.84, 0.16],
       [0.96, 0.04],
       [0.97, 0.03],
       [0.9 , 0.1 ],
       [0.96, 0.04],
       [0.96, 0.04],
       [0.83, 0.17],
       [0.82, 0.18],
       [0.9 , 0.1 ],
       [0.92, 0.08],
       [0.86, 0.14],
       [0.86, 0.14],
       [0.89, 0.11],
       [0.97, 0.03],
       [0.81, 0.19],
       [0.92, 0.08],
       [0.94, 0.06],
       [0.96, 0.04],
       [0.98, 0.02],
       [0.93, 0.07],
       [0.92, 0.08],
       [0.86, 0.14],
       [0.9 , 0.1 ],
       [0.91, 0.09],
       [0.98, 0.02],
       [0.91, 0.09],
       [0.95, 0.05],
       [0.82, 0.18],
       [0.91, 0.09],
       [0.94, 0.06],
       [0.94,

In [35]:
text_clf.classes_

array([0, 1])

In [36]:
prob_no_match = [p[0] for p in y_proba]
prob_match = [p[1] for p in y_proba]

In [37]:
df_test['prob_no_match'] = prob_no_match
df_test['prob_match'] = prob_match
df_test['y_pred'] = y_pred

In [38]:
# rename y_match?
y_adj = []

for p in prob_match:
    if p <= .5:
        y_adj.append("Not")
    else:
        y_adj.append("Match")
    
df_test['y_adj'] = y_adj

In [39]:
df_test[['category', 'y_pred', 'y_adj', 'prob_no_match', 'prob_match']]

Unnamed: 0_level_0,category,y_pred,y_adj,prob_no_match,prob_match
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
yhhh0100,0,0,Not,1.0,0.0
zgkv0183,0,0,Not,0.86,0.14
fxgb0191,0,0,Not,0.88,0.12
zgfk0146,0,0,Not,0.96,0.04
hgxh0257,0,0,Not,0.85,0.15
zxhb0039,0,0,Not,0.97,0.03
zxxb0079,0,0,Not,0.91,0.09
fhhb0039,0,0,Not,0.98,0.02
tkhb0039,0,0,Not,0.96,0.04
tqgp0190,0,0,Not,0.86,0.14


In [40]:
pysqldf("""SELECT * FROM df_test WHERE `y_pred` != category""")

Unnamed: 0,ID,subject,transcript,category,prob_no_match,prob_match,y_pred,y_adj
0,fypj0015,tobacco; cigarette; glycerine; health; filter;,imagine cigarette smoke disappears like smells like imagine cigare...,1,0.96,0.04,0,Not
1,jmhb0039,tobacco; cigarette; health; effects;,would like thank taking time morning time like apologize relativel...,1,0.86,0.14,0,Not
2,jtgp0190,tobacco; cigarette;,ingredient cigarette smoke identified injuries human health confid...,0,0.22,0.78,1,Match
3,gzgp0190,tobacco; cigarette; ingredients;,shown think responsibility responsive also concerned charges level...,0,0.39,0.61,1,Match
4,fpgl0191,tobacco; cigarette; health; death;,mr johnson want start testimony colleague seem almost fanatical in...,1,0.83,0.17,0,Not
5,hngp0190,tobacco; cigarette; health,david brinkley washington john chancellor new york good evening to...,1,0.82,0.18,0,Not
6,hsvk0086,tobacco; cigarette; lungs; health; toxin;,martha washington hospital chicago sue hoffman s first night with...,1,0.89,0.11,0,Not
7,grhb0039,tobacco; cigarette; health; profit;,ladies gentlemen senior vice president marketing rj artist develop...,1,0.86,0.14,0,Not
8,grfy0038,tobacco; cigarette; health; lawsuits;,hello welcome i m mark firestone vice president associate general ...,1,0.94,0.06,0,Not
9,yxxb0079,tobacco; cigarette,new smokeless cigarettes already focus criticism controversy new e...,0,0.39,0.61,1,Match


In [41]:
pysqldf(f"""
SELECT df_test.*, df.{text_source}, df.category
FROM df_test
JOIN df
ON df_test.ID = df.ID
""")

Unnamed: 0,ID,subject,transcript,category,prob_no_match,prob_match,y_pred,y_adj,transcript.1,category.1
0,yhhh0100,tobacco; cigarette,john wingate reporting cigarette popularity story cigarette front ...,0,1.0,0.0,0,Not,john wingate reporting cigarette popularity story cigarette front ...,0
1,zgkv0183,tobacco; cigarette,cappuccino davis woman must affect us says approve puppy know my w...,0,0.86,0.14,0,Not,cappuccino davis woman must affect us says approve puppy know my w...,0
2,fxgb0191,tobacco; cigarette; taste; time;,come flavor famous marlboro red extra long marlboro come marlboro ...,0,0.88,0.12,0,Not,come flavor famous marlboro red extra long marlboro come marlboro ...,0
3,zgfk0146,tobacco; cigarette; news; debate,question question guess protect unborn children know wait line sto...,0,0.96,0.04,0,Not,question question guess protect unborn children know wait line sto...,0
4,hgxh0257,lawsuit; legal activity; litigation; opioids; sales; marketing;,record i d like hand document mark exhibit you see that front is r...,0,0.85,0.15,0,Not,record i d like hand document mark exhibit you see that front is r...,0
5,zxhb0039,tobacco; cigarette; sales; marketing,camel winston s term three years ago camel old outdated brand rapi...,0,0.97,0.03,0,Not,camel winston s term three years ago camel old outdated brand rapi...,0
6,zxxb0079,tobacco; cigarette; company logos; children; research; study,simpler task products logos include measure exactly okay like nati...,0,0.91,0.09,0,Not,simpler task products logos include measure exactly okay like nati...,0
7,fhhb0039,tobacco; cigarette; advertise; appeal;,hello doug sarah mcclurkin think time program talked know one ciga...,0,0.98,0.02,0,Not,hello doug sarah mcclurkin think time program talked know one ciga...,0
8,tkhb0039,tobacco; cigarette; advertising; new product,imagine cigarette smoke disappears like smells like imagine cigare...,0,0.96,0.04,0,Not,imagine cigarette smoke disappears like smells like imagine cigare...,0
9,tqgp0190,tobacco; cigarette,take controversy cigarettes st congress house committee heard test...,0,0.86,0.14,0,Not,take controversy cigarettes st congress house committee heard test...,0


In [42]:
y_pred = text_clf.predict(df_test[text_source])

In [43]:
# everything below this line will vary based on the run

In [44]:
precision, recall, fscore, train_support = precision_recall_fscore_support(df_test['category'], y_pred, pos_label=1, average='binary')
print('Precision: {} / Recall: {} / Accuracy: {} / fscore: {}'.format(
    round(precision, 3), round(recall, 3), (round((y_pred==df_test['category']).sum()/len(y_adj), 3)), round(fscore, 3)))

Precision: 0.143 / Recall: 0.062 / Accuracy: 0.87 / fscore: 0.087


In [45]:
print('f-score', 2 * ((precision * recall) / (precision + recall)))

f-score 0.08695652173913043


In [46]:
# note: is 30 matching videos not enough? probably not bc only about 9%
