In [32]:
# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import string
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

In [2]:
from transformers import pipeline
classifier = pipeline("zero-shot-classification")

No model was supplied, defaulted to facebook/bart-large-mnli (https://huggingface.co/facebook/bart-large-mnli)


In [3]:
def ultimate_tokenize(sentence):
    # Remove punctuation and digits
    sentence = sentence.translate(str.maketrans('', '', string.punctuation + string.digits))
    return word_tokenize(sentence.lower())

In [4]:
def cleaning(interview):
    
    tokens = ultimate_tokenize(interview)
    from nltk.corpus import stopwords
    
    stops = stopwords.words('english')
    stops.extend(['yeah','hello','ye','yes','okay','ok'])
    stops.extend('.,[,],(,),;,/,-,\',?,",:,<,>,n\'t,|,#,\'s,\",\'re,\'ve,\'ll,\'d,\'re,’'.split(','))
    stops.extend(',')
    
    # 6. Remove stop words. 
    words = [w for w in tokens if not w in stops]
    
    sent = ' '.join(words)
    return sent

In [5]:
# Setting directory structure
root_dir = 'C:\\Users\\yashd\\Desktop\\rethink-media\\quote-classification\\'
data_dir = 'Data\\cleaned_data\\'

In [6]:
# Importing cleaned sample data for EDA
os.chdir(root_dir)
GNI88_df = pd.read_csv(root_dir + data_dir + 'GNI88_cleaned_data.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [12]:
GNI88_df.columns

Index(['artdate', 'Article ID', 'Headline', 'Quote ID', 'Article Status',
       'Article Position', 'Messages', 'Submessages', 'Quote Position',
       'Legacy Quote Tag', 'On/Off Message', 'QText', 'Source Name',
       'Source Type', 'Source Party Affiliation', 'Source Ethnicity',
       'Source Nationality', 'Source Gender', 'Source Religion',
       'Legacy Source Tag', 'Constituent Group', 'Media Name', 'Media Medium',
       'Journalist Name', 'Constituent Author', 'Article Issues',
       'Custom Group', 'Media Group', 'fulltext', 'source_name_cleaned'],
      dtype='object')

In [13]:
GNI88_df['Legacy Quote Tag'].value_counts()

Grand List     237
Off-Message    128
Name: Legacy Quote Tag, dtype: int64

In [14]:
GNI88_df['On/Off Message'].value_counts()

Not On Message    4571
On Message        1537
Name: On/Off Message, dtype: int64

In [16]:
# Dropping the 1 float value quote text to conduct analysis
GNI88_df = GNI88_df[(GNI88_df['QText'].map(lambda x : type(x)) != float)]
shortest_quote = min(GNI88_df['QText'].astype(str), key = len)
longest_quote = max(GNI88_df['QText'], key = len)
num_words_min = len(shortest_quote.split())
num_words_max = len(longest_quote.split())
GNI88_df['QText'].str.len().mean()

170.82429330504675

In [17]:
GNI88_df['QText'].str.len().min()

1

In [18]:
GNI88_df['Source Name'].isna().value_counts()

False    404302
True       3901
Name: Source Name, dtype: int64

In [19]:
GNI88_df['Source Name'].value_counts()

Donald Trump OLD                          15091
Barack Obama OLD                           9458
Unnamed Obama White House Official OLD     9428
Unnamed Analyst/Expert                     8597
Benjamin Netanyahu                         5977
                                          ...  
Aleksey Pushkov                               1
Hesham Ghanbari                               1
Ihor Romanenko                                1
Mikhail Margelov                              1
Steve Bell                                    1
Name: Source Name, Length: 16921, dtype: int64

In [20]:
len(GNI88_df['Source Name'].unique())

16922

In [21]:
', '.join(GNI88_df['Source Type'].value_counts().index.tolist())

'Foreign Gov/Mil Official, Federal Official, Media/Journalist, Analyst/Commentator, US Senate & Staff, Nuke Organization, Former Admin. Officials, US Rep. & Staff, US Military, Academic, Nuke Organization - Academic, International Orgs, Citizen, Non-Profit/NGO, Think Tanks, Partisans/Fmr. Politicians, Corporate Official, Regulator, State/Local Official, Other, Blogger, Public Polling, Religious/Clerical, Nuclear Scientist, Attorney, Terrorist/Extremist, US Police, Judicial Official, Nuclear Official, Military, Defense, Former DIA intelligence, Former Soviet Military Officer, Deputy, Defense Forces, Activist, Chairman, retired US Military, P & S - Former Government, Director, Ambassador, Information minister, Ministry, EU Official, Former Ambassador, diplomat, Research Group, Embassy, South Korean Official, Former Russian Official, Backend developer at Skylum, a software developing company'

In [22]:
GNI88_df['Source Party Affiliation'].value_counts()

Republican     42808
Democrat       33044
Independent      556
The hill           1
Name: Source Party Affiliation, dtype: int64

In [23]:
GNI88_df['Source Nationality'].value_counts()

United States    200638
Iran              30660
Israel            13694
Russia            11988
South Korea       11178
                  ...  
Ucrania               1
Guyana                1
Peru                  1
Liechtenstein         1
Chechnya              1
Name: Source Nationality, Length: 120, dtype: int64

In [24]:
GNI88_df.dtypes

artdate                     object
Article ID                   int64
Headline                    object
Quote ID                     int64
Article Status              object
Article Position            object
Messages                    object
Submessages                 object
Quote Position              object
Legacy Quote Tag            object
On/Off Message              object
QText                       object
Source Name                 object
Source Type                 object
Source Party Affiliation    object
Source Ethnicity            object
Source Nationality          object
Source Gender               object
Source Religion             object
Legacy Source Tag           object
Constituent Group           object
Media Name                  object
Media Medium                object
Journalist Name             object
Constituent Author          object
Article Issues              object
Custom Group                object
Media Group                 object
fulltext            

In [25]:
GNI88_df['Source Gender'].value_counts()

Male            233190
Unknown         115731
Female           28258
Organization     21658
Name: Source Gender, dtype: int64

In [26]:
GNI88_df['Source Religion'].value_counts()

Unknown    343877
Name: Source Religion, dtype: int64

In [27]:
GNI88_df['Source Party Affiliation'].isna().value_counts()

True     331794
False     76409
Name: Source Party Affiliation, dtype: int64

In [28]:
GNI88_df['Source Ethnicity'].value_counts()

Person of Color    4824
Name: Source Ethnicity, dtype: int64

In [29]:
top_words = GNI88_df['QText'].str.split().explode().value_counts()

In [30]:
top_words[top_words.to_frame()['QText'].map(lambda x: x > 5)]

the               629073
to                405971
of                276685
and               269593
a                 257785
                   ...  
decades:               6
1600                   6
slowing,               6
counterfeiting         6
Them                   6
Name: QText, Length: 44202, dtype: int64

In [34]:
%%time
stopwords_removed = GNI88_df['QText'].map(cleaning)
stopwords_removed

Wall time: 9min 14s


0         add emergency missile defense ship repair mone...
1         modest expectations ability predict kim jong u...
2                                        rhetoric president
3         could result loss millions lives possibility n...
4         essentially transactional weve pretty stable r...
                                ...                        
408199    trump administration strategy preemptive blood...
408200    preference achieve denuclearization korean pen...
408201    president lacks authorityfor attack wouldspark...
408202    completely different machine said adding would...
408203    russia intervened syrian civil war aid preside...
Name: QText, Length: 408203, dtype: object

In [36]:
stopwords_removed.str.split().explode().value_counts()

nuclear             154707
iran                 89547
us                   64631
north                64069
would                59666
                     ...  
americaninspired         1
jovially                 1
uraniumremoval           1
darnednear               1
wouldspark               1
Name: QText, Length: 68855, dtype: int64