In [31]:
import pandas as pd
import re
from pandasql import sqldf
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import cross_val_score
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [32]:
# Display settings: render up to 70 characters per column and never truncate rows.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 70)


def pysqldf(q):
    """Run the SQL string `q` through `sqldf`, passing this module's globals() as the environment."""
    return sqldf(q, globals())

In [33]:
# Fetch the NLTK resources used by this notebook: the stop-word corpus read via
# stopwords.words(...), and WordNet (presumably for the WordNetLemmatizer created below).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/noelsalmeron/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/noelsalmeron/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [34]:
# Notebook-level settings; text_source names the df_ml column selected by the dataset query.
tag = "lawsuit"
text_source = "transcript"

# Text-cleaning resources: NLTK's English stop-word list and a WordNet lemmatizer.
lemmatizer = WordNetLemmatizer()
stop = stopwords.words('english')

In [35]:
# Load the two input tables that the SQL join below reads (as df_human and df_ml).
df_human, df_ml = (
    pd.read_csv("data/classifications.csv"),
    pd.read_csv("data/transcript_accuracy.csv"),
)

In [36]:
# Join df_human and df_ml on their ID columns, keeping every df_human column plus the
# df_ml column named by text_source. Rows whose description is NULL or is exactly
# 'error code 224003' are excluded.
df_dataset = pysqldf(f"""
SELECT dh.*, dml.{text_source}
FROM 
    df_human dh
JOIN
    df_ml dml
ON dh.ID = dml.id
    WHERE description is not NULL
    AND description != 'error code 224003'
""")

In [37]:
# Number of rows left after the join and the description filters.
df_dataset.shape[0]

341

In [38]:
# Lowercase every transcript so the later word-level filters see lowercase text.
# The vectorised .str accessor replaces .apply(str.lower, 'transcript'): the extra
# positional argument there was not a column selector -- Series.apply binds it to its
# second parameter (convert_dtype, deprecated in pandas 2.1). Missing values pass
# through as NaN instead of raising inside str.lower.
df_dataset['transcript'] = df_dataset['transcript'].str.lower()

In [39]:
# NOTE(review): the stopwords corpus is already requested by an earlier
# nltk.download('stopwords') call in this notebook, so this cell looks redundant --
# confirm before removing.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/noelsalmeron/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [40]:
# NLTK's English stop words plus a few filler tokens that commonly show up in transcripts.
# 'p.m.' is listed next to the original 'p.m.,' (trailing comma) so that the bare token
# is filtered the same way 'a.m.' is.
stop_words = stopwords.words('english') + ["we're", 'p.m.', 'p.m.,', 'a.m.', 'okay', 'yes']

def remove_stop_words(str):
    """Return `str` without the whitespace-separated tokens that appear in `stop_words`.

    Tokens are compared exactly as split (case and attached punctuation included).
    Surviving tokens are joined by single spaces with no trailing space.
    """
    kept = (token for token in str.split() if token not in stop_words)
    return ' '.join(kept)

# Strip stop words from every transcript. Series.apply is given only the function:
# the former second argument 'transcript' did not select a column -- it was bound to
# Series.apply's convert_dtype parameter (deprecated in pandas 2.1).
df_dataset['transcript'] = df_dataset['transcript'].apply(remove_stop_words)
df_dataset.head(5)

Unnamed: 0,ID,description,subject,title,runtime,transcript
0,ffxh0257,Deposition of Jeff Kilper in RE: National Prescription Opiate Liti...,lawsuit; legal activity; litigation; opioids; sales; marketing;,"Deposition of Jeff Kilper, Senior Director for Finance for Special...",1:07:17,record 12:49 took lunch break mrs. dilber talking exhibit number 9...
1,fghb0039,Camel Cigarettes infomercial on why their cigarettes are superior ...,tobacco; cigarette; marketing;,Camel Cigarettes informercial,00:20:51,getting pretty late hit trail snoop around living like ain't 9 5 j...
2,fgxh0257,Deposition of Mark Trudeau in Re: State of Rhode Island v Purdue P...,lawsuit; legal activity; litigation; opioids; sales; marketing;,"Deposition of Mark Trudeau, President and CEO",0:52:40,back record 551 questions witness concludes mr. trudeau compositio...
3,fhfk0146,CNN debate on the ban of smoking on Airplanes. Ban is put in place...,tobacco; cigarette;secondhand smoke; closed environment;,CNN Crossfire debate on smoking in public,0:26:20,tonight washington crossfire saturday smoking skies ifs ands butts...
4,fhgb0191,Compilation of ads for children to say no to smoking.,tobacco; cigarette; youth access; PSA; children; future;,Think. Don't Smoke. ad campaign compilation,0:34:02,hey guys may ask couple questions ever tried cigarettes know reall...


In [41]:
# keep ASCII letters only: digits, punctuation and whitespace runs each collapse to one space
def alphabetic(str):
    """Replace every run of characters outside a-z / A-Z in `str` with a single space."""
    non_letters = r'[^a-zA-Z]+'
    return re.sub(non_letters, ' ', str)

# Reduce every transcript to letters and single spaces. Series.apply is given only the
# function: the former second argument 'transcript' did not select a column -- it was
# bound to Series.apply's convert_dtype parameter (deprecated in pandas 2.1).
df_dataset['transcript'] = df_dataset['transcript'].apply(alphabetic)
df_dataset.head()

Unnamed: 0,ID,description,subject,title,runtime,transcript
0,ffxh0257,Deposition of Jeff Kilper in RE: National Prescription Opiate Liti...,lawsuit; legal activity; litigation; opioids; sales; marketing;,"Deposition of Jeff Kilper, Senior Director for Finance for Special...",1:07:17,record took lunch break mrs dilber talking exhibit number page num...
1,fghb0039,Camel Cigarettes infomercial on why their cigarettes are superior ...,tobacco; cigarette; marketing;,Camel Cigarettes informercial,00:20:51,getting pretty late hit trail snoop around living like ain t job k...
2,fgxh0257,Deposition of Mark Trudeau in Re: State of Rhode Island v Purdue P...,lawsuit; legal activity; litigation; opioids; sales; marketing;,"Deposition of Mark Trudeau, President and CEO",0:52:40,back record questions witness concludes mr trudeau composition tha...
3,fhfk0146,CNN debate on the ban of smoking on Airplanes. Ban is put in place...,tobacco; cigarette;secondhand smoke; closed environment;,CNN Crossfire debate on smoking in public,0:26:20,tonight washington crossfire saturday smoking skies ifs ands butts...
4,fhgb0191,Compilation of ads for children to say no to smoking.,tobacco; cigarette; youth access; PSA; children; future;,Think. Don't Smoke. ad campaign compilation,0:34:02,hey guys may ask couple questions ever tried cigarettes know reall...


In [42]:
# Porter stemmer used to build the stemmed copy of each transcript
stemmer = PorterStemmer()
def stem(str):
    """Return `str` with each whitespace-separated word replaced by its Porter stem.

    Stems are joined by single spaces with no trailing space, matching the output
    format of remove_stop_words (the loop-and-append version left a dangling space).
    """
    return ' '.join(stemmer.stem(word) for word in str.split())

# Add a 'stemmed' column derived from the cleaned transcripts. Series.apply is given only
# the function: the former second argument 'transcript' did not select a column -- it was
# bound to Series.apply's convert_dtype parameter (deprecated in pandas 2.1).
df_dataset['stemmed'] = df_dataset['transcript'].apply(stem)
df_dataset.head()

Unnamed: 0,ID,description,subject,title,runtime,transcript,stemmed
0,ffxh0257,Deposition of Jeff Kilper in RE: National Prescription Opiate Liti...,lawsuit; legal activity; litigation; opioids; sales; marketing;,"Deposition of Jeff Kilper, Senior Director for Finance for Special...",1:07:17,record took lunch break mrs dilber talking exhibit number page num...,record took lunch break mr dilber talk exhibit number page number ...
1,fghb0039,Camel Cigarettes infomercial on why their cigarettes are superior ...,tobacco; cigarette; marketing;,Camel Cigarettes informercial,00:20:51,getting pretty late hit trail snoop around living like ain t job k...,get pretti late hit trail snoop around live like ain t job knew ki...
2,fgxh0257,Deposition of Mark Trudeau in Re: State of Rhode Island v Purdue P...,lawsuit; legal activity; litigation; opioids; sales; marketing;,"Deposition of Mark Trudeau, President and CEO",0:52:40,back record questions witness concludes mr trudeau composition tha...,back record question wit conclud mr trudeau composit thank sir tha...
3,fhfk0146,CNN debate on the ban of smoking on Airplanes. Ban is put in place...,tobacco; cigarette;secondhand smoke; closed environment;,CNN Crossfire debate on smoking in public,0:26:20,tonight washington crossfire saturday smoking skies ifs ands butts...,tonight washington crossfir saturday smoke sky if and butt left pa...
4,fhgb0191,Compilation of ads for children to say no to smoking.,tobacco; cigarette; youth access; PSA; children; future;,Think. Don't Smoke. ad campaign compilation,0:34:02,hey guys may ask couple questions ever tried cigarettes know reall...,hey guy may ask coupl question ever tri cigarett know realli much ...


In [43]:
# drop words that are only one or two characters long
def character_clean(str):
    """Return `str` keeping only the whitespace-separated words longer than two characters.

    Kept words are joined by single spaces. Unlike the earlier loop-and-append version,
    no trailing space is left on the result, so the values written out later are clean.
    An input with no qualifying words yields the empty string.
    """
    return ' '.join(word for word in str.split() if len(word) > 2)

# Remove one- and two-character words from both text columns. Series.apply is given only
# the function: the former second arguments ('transcript' / 'stemmed') did not select a
# column -- they were bound to Series.apply's convert_dtype parameter (deprecated in
# pandas 2.1).
df_dataset['transcript'] = df_dataset['transcript'].apply(character_clean)
df_dataset['stemmed'] = df_dataset['stemmed'].apply(character_clean)
df_dataset.head()

Unnamed: 0,ID,description,subject,title,runtime,transcript,stemmed
0,ffxh0257,Deposition of Jeff Kilper in RE: National Prescription Opiate Liti...,lawsuit; legal activity; litigation; opioids; sales; marketing;,"Deposition of Jeff Kilper, Senior Director for Finance for Special...",1:07:17,record took lunch break mrs dilber talking exhibit number page num...,record took lunch break dilber talk exhibit number page number cou...
1,fghb0039,Camel Cigarettes infomercial on why their cigarettes are superior ...,tobacco; cigarette; marketing;,Camel Cigarettes informercial,00:20:51,getting pretty late hit trail snoop around living like ain job kne...,get pretti late hit trail snoop around live like ain job knew kind...
2,fgxh0257,Deposition of Mark Trudeau in Re: State of Rhode Island v Purdue P...,lawsuit; legal activity; litigation; opioids; sales; marketing;,"Deposition of Mark Trudeau, President and CEO",0:52:40,back record questions witness concludes trudeau composition thank ...,back record question wit conclud trudeau composit thank sir thank ...
3,fhfk0146,CNN debate on the ban of smoking on Airplanes. Ban is put in place...,tobacco; cigarette;secondhand smoke; closed environment;,CNN Crossfire debate on smoking in public,0:26:20,tonight washington crossfire saturday smoking skies ifs ands butts...,tonight washington crossfir saturday smoke sky and butt left part ...
4,fhgb0191,Compilation of ads for children to say no to smoking.,tobacco; cigarette; youth access; PSA; children; future;,Think. Don't Smoke. ad campaign compilation,0:34:02,hey guys may ask couple questions ever tried cigarettes know reall...,hey guy may ask coupl question ever tri cigarett know realli much ...


In [44]:
# Discard the description and subject columns before saving, then preview ten rows.
df_dataset = df_dataset.drop(columns=['description', 'subject'])
df_dataset.head(10)

Unnamed: 0,ID,title,runtime,transcript,stemmed
0,ffxh0257,"Deposition of Jeff Kilper, Senior Director for Finance for Special...",1:07:17,record took lunch break mrs dilber talking exhibit number page num...,record took lunch break dilber talk exhibit number page number cou...
1,fghb0039,Camel Cigarettes informercial,00:20:51,getting pretty late hit trail snoop around living like ain job kne...,get pretti late hit trail snoop around live like ain job knew kind...
2,fgxh0257,"Deposition of Mark Trudeau, President and CEO",0:52:40,back record questions witness concludes trudeau composition thank ...,back record question wit conclud trudeau composit thank sir thank ...
3,fhfk0146,CNN Crossfire debate on smoking in public,0:26:20,tonight washington crossfire saturday smoking skies ifs ands butts...,tonight washington crossfir saturday smoke sky and butt left part ...
4,fhgb0191,Think. Don't Smoke. ad campaign compilation,0:34:02,hey guys may ask couple questions ever tried cigarettes know reall...,hey guy may ask coupl question ever tri cigarett know realli much ...
5,fhhb0039,RJ Reynolds Tobacco board meeting on advertisment,0:15:07,hello doug sarah mcclurkin think time program talked know one ciga...,hello doug sarah mcclurkin think time program talk know one cigare...
6,fhhh0100,Camel cigarette advertisment compilation,0:27:35,ever heard man literally walk wall human fly well looking hundred ...,ever heard man liter walk wall human fli well look hundr feet mari...
7,fhhj0223,ABC 7 news report: WeCard,0:09:08,top story tonight indiana retail associations joining forces keep ...,top stori tonight indiana retail associ join forc keep tobacco rea...
8,fhwm0256,Opiod prescriber interview,0:01:35,tell remember detail happened iraq benefits drug used force rememb...,tell rememb detail happen iraq benefit drug use forc rememb stuff ...
9,fjhb0039,Winston No Bull comedy break,0:17:38,welcome winston bull comedy break guys gonna right watching gerald...,welcom winston bull comedi break guy gonna right watch geraldo com...


In [45]:
# Write the cleaned dataset to disk, leaving out the DataFrame index.
df_dataset.to_csv(path_or_buf='data/cleaned_data.csv', index=False)