In [0]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
import os
import tensorflow as tf 
from google.colab import drive

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

drive.mount("/content/gdrive", force_remount=True)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from nltk import pos_tag, word_tokenize
import numpy as np


sent = "US bloggers banned from entering UK"

def count_tags_sentence(sentence, tag_prefix):
    """
    Counts the number of words in the given sentence
    whose PoS tag matches the given tag_prefix.

    :param string sentence: the sentence to process
    :param string tag_prefix: Penn Treebank tag prefix (e.g. JJ, RB)
    :return: number of words with the tag, matching the tag_prefix
    """
    tags = pos_tag(word_tokenize(sentence.lower()))
    return len(list(filter(lambda tag: tag[1].startswith(tag_prefix), tags)))

def count_adjectives(sentences):
    return list(map(lambda s: [count_tags_sentence(s, tag_prefix='JJ')], sentences))

def count_adverbs(sentences):
    return list(map(lambda s: [count_tags_sentence(s, tag_prefix='RB')], sentences))

def count_singular_pronouns(sentences):
    return list(map(lambda s: [count_tags_sentence(s, tag_prefix='NNP')], sentences))

def count_plural_pronouns(sentences):
    return list(map(lambda s: [count_tags_sentence(s, tag_prefix='NNPS')], sentences))



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Found GPU at: /device:GPU:0
Mounted at /content/gdrive
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [0]:
def update_dataset(option):
  
  source_dir = '/content/gdrive/My Drive/datathon/train/'
  files = os.listdir(source_dir)
  
  for f in files:
    if f.endswith('.txt'):
      if os.path.isfile('/content/gdrive/My Drive/datathon/train/' + f) and os.path.isfile('/content/gdrive/My Drive/datathon/train/' + f[:-3] + 'task2.labels'):
        with open('/content/gdrive/My Drive/datathon/train/' + f) as f_text, open('/content/gdrive/My Drive/datathon/train/' + f[:-3] + 'task2.labels', 'r') as f_labs:
          lines_f_text = f_text.readlines()
          lines_f_labs = f_labs.readlines()
          for i, l in enumerate(lines_f_text):
            if l == '\n':
              continue
            else:
              if option == "tags":
                n_adj = count_tags_sentence(l[:-1], 'JJ')
                n_adv = count_tags_sentence(l[:-1], 'RB')
                n_singular_pronouns = count_tags_sentence(l[:-1], 'NNP')
                n_plural_pronouns = count_tags_sentence(l[:-1], 'NNPS')
                line_to_write = '\t'.join([str(n) for n in [n_adj, n_adv, n_singular_pronouns, n_plural_pronouns]])
                lines_f_labs[i] = lines_f_labs[i].rstrip() + '\t' + line_to_write + '\n'
                #f_labs_aug.write(lines_f_labs[i][:-1] + '\t' + line_to_write + '\n')

        with open('/content/gdrive/My Drive/datathon/train/' + f[:-3] + 'task2.labels', 'w') as f_labs_2:
          f_labs_2.writelines(lines_f_labs) 

FileNotFoundError: ignored

In [0]:
%matplotlib inline

# Import libraries
import pandas as pd
import numpy as np
import os
import re
import glob
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Bidirectional, GlobalMaxPool1D, Dense, Dropout

from sklearn.metrics import f1_score, confusion_matrix

import glob


def dataprep_task2(path):
    """Dataprep for Task2 It will return the new data
    :param path: Path to the article's taks3 labels file.
    Example:
    >>> dataprep_task2("datasets-v5/tasks-2-3/train/article111111112.task2.labels")
    """
    dir_name = os.path.dirname(path)
    article_id = os.path.basename(path).split('.')[0]
    article_name = os.path.join(dir_name, f'{article_id}.txt')

    with open(article_name, 'r', encoding='utf8') as f:
        records = f.readlines()

    df = pd.DataFrame(records, columns=['sentences'])

    another_df = pd.read_csv(path, sep='\t', names = ['article', 'N_sentence', 'is_propaganda', 'sadness' , 'joy' ,'fear', 'disgust' ,'anger'], encoding='utf8')
    
    result_df = pd.concat([df, another_df], axis=1)
    
    return result_df.loc[result_df['sentences'] != '\n', :]
  
  

Using TensorFlow backend.


In [0]:
df['sentences_prep'] = df['sentences'].apply(clean_text)

In [0]:
afd = dataprep_task2('/content/gdrive/My Drive/datathon/train/article111111112.task2.labels')

In [0]:
fileNames = glob.glob('/content/gdrive/My Drive/datathon/train/*.task2.labels')

In [0]:
afd.head(10)

Unnamed: 0,sentences,article,N_sentence,is_propaganda,sadness,joy,fear,disgust,anger
0,US bloggers banned from entering UK\n,111111112,1,non-propaganda,0.273694,0.104768,0.12182,0.378717,0.378898
2,Two prominent US bloggers have been banned fro...,111111112,3,non-propaganda,0.322924,0.158435,0.195255,0.267676,0.273668
4,Pamela Geller and Robert Spencer co-founded an...,111111112,5,propaganda,0.273383,0.068818,0.156252,0.556396,0.241289
6,They were due to speak at an English Defence L...,111111112,7,non-propaganda,0.509438,0.087796,0.126802,0.149326,0.254227
8,A government spokesman said individuals whose ...,111111112,9,non-propaganda,0.124354,0.568301,0.147631,0.22066,0.084066
10,"He added: ""We condemn all those whose behaviou...",111111112,11,propaganda,0.119858,0.170223,0.072858,0.523191,0.035549
12,'Right decision'\n,111111112,13,non-propaganda,0.144572,0.267605,0.123545,0.085044,0.163631
13,"Ms Geller, of the Atlas Shrugs blog, and Mr Sp...",111111112,14,propaganda,0.35066,0.370516,0.262476,0.128524,0.033058
14,On both of their blogs the pair called their b...,111111112,15,propaganda,0.521116,0.037132,0.086949,0.201311,0.495579
16,They were due to attend a march planned by the...,111111112,17,non-propaganda,0.614666,0.02863,0.15145,0.314787,0.107312


In [0]:
res_list = list()

for f in fileNames:
    res_list.append(dataprep_task2(f))

In [0]:
df = pd.concat(res_list)

In [0]:
df.head(10)

Unnamed: 0,sentences,article,N_sentence,is_propaganda,sadness,joy,fear,disgust,anger
0,New Audio From The Night Of The Las Vegas Mass...,704856340,1,non-propaganda,0.093891,0.481352,0.238193,0.133129,0.162108
2,Newly released audio from the Clark County Fir...,704856340,3,non-propaganda,0.456951,0.024205,0.162442,0.419307,0.281755
3,"The audio, released on the SoundCloud account ...",704856340,4,non-propaganda,0.193504,0.280196,0.125014,0.140594,0.36693
4,"Intellihub’s Shepard Ambellas, who has extensi...",704856340,5,non-propaganda,0.674633,0.155427,0.046534,0.116447,0.051066
5,“We have a firefighter’s wife at this event wh...,704856340,6,non-propaganda,0.356464,0.171645,0.098791,0.098959,0.272835
6,"We are trying to get further on the name,” dis...",704856340,7,non-propaganda,0.395951,0.058361,0.141609,0.075802,0.107938
7,"“Batallion 6, be advised that we are getting r...",704856340,8,non-propaganda,0.082155,0.041201,0.110452,0.028624,0.071853
8,"After being asked to confirm the information, ...",704856340,9,non-propaganda,0.457318,0.065556,0.055998,0.069855,0.213325
9,“The only information I have is it’s the bar o...,704856340,10,non-propaganda,0.477513,0.157989,0.067258,0.036316,0.191752
10,"Interestingly, the story doesn’t end there, as...",704856340,11,non-propaganda,0.109102,0.687029,0.089832,0.021488,0.131763


In [0]:
df['sentences'] = df['sentences'].str.replace('\n', '')

In [0]:
df.to_csv('/content/gdrive/My Drive/datathon/task2.csv', sep='|||', index=False)

TypeError: ignored

In [0]:
df.loc[df['article'] == 111111112]


Unnamed: 0,sentences,article,N_sentence,is_propaganda,sadness,joy,fear,disgust,anger
0,US bloggers banned from entering UK\n,111111112,1,non-propaganda,0.273694,0.104768,0.12182,0.378717,0.378898
2,Two prominent US bloggers have been banned fro...,111111112,3,non-propaganda,0.322924,0.158435,0.195255,0.267676,0.273668
4,Pamela Geller and Robert Spencer co-founded an...,111111112,5,propaganda,0.273383,0.068818,0.156252,0.556396,0.241289
6,They were due to speak at an English Defence L...,111111112,7,non-propaganda,0.509438,0.087796,0.126802,0.149326,0.254227
8,A government spokesman said individuals whose ...,111111112,9,non-propaganda,0.124354,0.568301,0.147631,0.22066,0.084066
10,"He added: ""We condemn all those whose behaviou...",111111112,11,propaganda,0.119858,0.170223,0.072858,0.523191,0.035549
12,'Right decision'\n,111111112,13,non-propaganda,0.144572,0.267605,0.123545,0.085044,0.163631
13,"Ms Geller, of the Atlas Shrugs blog, and Mr Sp...",111111112,14,propaganda,0.35066,0.370516,0.262476,0.128524,0.033058
14,On both of their blogs the pair called their b...,111111112,15,propaganda,0.521116,0.037132,0.086949,0.201311,0.495579
16,They were due to attend a march planned by the...,111111112,17,non-propaganda,0.614666,0.02863,0.15145,0.314787,0.107312


In [0]:
df2 = pd.read_csv('/content/gdrive/My Drive/datathon/task2data.csv', sep='\t')

In [0]:
df2.head(10)

Unnamed: 0,sentences,article,N_sentence,is_propaganda,sadness,joy,fear,disgust,anger
0,New Audio From The Night Of The Las Vegas Mass...,704856340,1,non-propaganda,0.093891,0.481352,0.238193,0.133129,0.162108
1,Newly released audio from the Clark County Fir...,704856340,3,non-propaganda,0.456951,0.024205,0.162442,0.419307,0.281755
2,"The audio, released on the SoundCloud account ...",704856340,4,non-propaganda,0.193504,0.280196,0.125014,0.140594,0.36693
3,"Intellihub’s Shepard Ambellas, who has extensi...",704856340,5,non-propaganda,0.674633,0.155427,0.046534,0.116447,0.051066
4,“We have a firefighter’s wife at this event wh...,704856340,6,non-propaganda,0.356464,0.171645,0.098791,0.098959,0.272835
5,"We are trying to get further on the name,” dis...",704856340,7,non-propaganda,0.395951,0.058361,0.141609,0.075802,0.107938
6,"“Batallion 6, be advised that we are getting r...",704856340,8,non-propaganda,0.082155,0.041201,0.110452,0.028624,0.071853
7,"After being asked to confirm the information, ...",704856340,9,non-propaganda,0.457318,0.065556,0.055998,0.069855,0.213325
8,“The only information I have is it’s the bar o...,704856340,10,non-propaganda,0.477513,0.157989,0.067258,0.036316,0.191752
9,"Interestingly, the story doesn’t end there, as...",704856340,11,non-propaganda,0.109102,0.687029,0.089832,0.021488,0.131763


In [0]:
df == df2

ValueError: ignored

In [0]:
df2.shape

(14263, 9)

In [0]:
df.shape

(14263, 9)

In [0]:
df.dtypes

sentences         object
article            int64
N_sentence         int64
is_propaganda     object
sadness          float64
joy              float64
fear             float64
disgust          float64
anger            float64
dtype: object

In [0]:
df2.dtypes

sentences         object
article            int64
N_sentence         int64
is_propaganda     object
sadness          float64
joy              float64
fear             float64
disgust          float64
anger            float64
dtype: object

In [0]:
df_text = df['sentences']

In [0]:
df_text.head()

0    New Audio From The Night Of The Las Vegas Mass...
2    Newly released audio from the Clark County Fir...
3    The audio, released on the SoundCloud account ...
4    Intellihub’s Shepard Ambellas, who has extensi...
5    “We have a firefighter’s wife at this event wh...
Name: sentences, dtype: object

In [0]:
df[['sentences']]

Unnamed: 0,sentences
0,New Audio From The Night Of The Las Vegas Mass...
2,Newly released audio from the Clark County Fir...
3,"The audio, released on the SoundCloud account ..."
4,"Intellihub’s Shepard Ambellas, who has extensi..."
5,“We have a firefighter’s wife at this event wh...
6,"We are trying to get further on the name,” dis..."
7,"“Batallion 6, be advised that we are getting r..."
8,"After being asked to confirm the information, ..."
9,“The only information I have is it’s the bar o...
10,"Interestingly, the story doesn’t end there, as..."


In [0]:
df.to_pickle('/content/gdrive/My Drive/datathon/task2data.pkl')

In [0]:
df3 = pd.read_pickle('/content/gdrive/My Drive/datathon/task2data.pkl')

In [0]:
df.equals(df3)

True

In [0]:
df.head(1000)

Unnamed: 0,sentences,article,N_sentence,is_propaganda,sadness,joy,fear,disgust,anger,target
0,New Audio From The Night Of The Las Vegas Mass...,704856340,1,non-propaganda,0.093891,0.481352,0.238193,0.133129,0.162108,0
2,Newly released audio from the Clark County Fir...,704856340,3,non-propaganda,0.456951,0.024205,0.162442,0.419307,0.281755,0
3,"The audio, released on the SoundCloud account ...",704856340,4,non-propaganda,0.193504,0.280196,0.125014,0.140594,0.366930,0
4,"Intellihub’s Shepard Ambellas, who has extensi...",704856340,5,non-propaganda,0.674633,0.155427,0.046534,0.116447,0.051066,0
5,“We have a firefighter’s wife at this event wh...,704856340,6,non-propaganda,0.356464,0.171645,0.098791,0.098959,0.272835,0
6,"We are trying to get further on the name,” dis...",704856340,7,non-propaganda,0.395951,0.058361,0.141609,0.075802,0.107938,0
7,"“Batallion 6, be advised that we are getting r...",704856340,8,non-propaganda,0.082155,0.041201,0.110452,0.028624,0.071853,0
8,"After being asked to confirm the information, ...",704856340,9,non-propaganda,0.457318,0.065556,0.055998,0.069855,0.213325,0
9,“The only information I have is it’s the bar o...,704856340,10,non-propaganda,0.477513,0.157989,0.067258,0.036316,0.191752,0
10,"Interestingly, the story doesn’t end there, as...",704856340,11,non-propaganda,0.109102,0.687029,0.089832,0.021488,0.131763,0


In [0]:
df.loc[df['target'] == 0]


Unnamed: 0,sentences,article,N_sentence,is_propaganda,sadness,joy,fear,disgust,anger,target
0,New Audio From The Night Of The Las Vegas Mass...,704856340,1,non-propaganda,0.093891,0.481352,0.238193,0.133129,0.162108,0
2,Newly released audio from the Clark County Fir...,704856340,3,non-propaganda,0.456951,0.024205,0.162442,0.419307,0.281755,0
3,"The audio, released on the SoundCloud account ...",704856340,4,non-propaganda,0.193504,0.280196,0.125014,0.140594,0.366930,0
4,"Intellihub’s Shepard Ambellas, who has extensi...",704856340,5,non-propaganda,0.674633,0.155427,0.046534,0.116447,0.051066,0
5,“We have a firefighter’s wife at this event wh...,704856340,6,non-propaganda,0.356464,0.171645,0.098791,0.098959,0.272835,0
6,"We are trying to get further on the name,” dis...",704856340,7,non-propaganda,0.395951,0.058361,0.141609,0.075802,0.107938,0
7,"“Batallion 6, be advised that we are getting r...",704856340,8,non-propaganda,0.082155,0.041201,0.110452,0.028624,0.071853,0
8,"After being asked to confirm the information, ...",704856340,9,non-propaganda,0.457318,0.065556,0.055998,0.069855,0.213325,0
9,“The only information I have is it’s the bar o...,704856340,10,non-propaganda,0.477513,0.157989,0.067258,0.036316,0.191752,0
10,"Interestingly, the story doesn’t end there, as...",704856340,11,non-propaganda,0.109102,0.687029,0.089832,0.021488,0.131763,0


3938 propaganda, 10325 non-propaganda

In [0]:
df.loc[df['article'] == 111111112]

In [0]:
df['target'] = df['is_propaganda'].map({'propaganda': 1, 'non-propaganda': 0})

In [0]:
text_only_df = df[['sentences','target']]

In [0]:
text_only_df.head(20)

Unnamed: 0,sentences,target
0,New Audio From The Night Of The Las Vegas Mass...,0
2,Newly released audio from the Clark County Fir...,0
3,"The audio, released on the SoundCloud account ...",0
4,"Intellihub’s Shepard Ambellas, who has extensi...",0
5,“We have a firefighter’s wife at this event wh...,0
6,"We are trying to get further on the name,” dis...",0
7,"“Batallion 6, be advised that we are getting r...",0
8,"After being asked to confirm the information, ...",0
9,“The only information I have is it’s the bar o...,0
10,"Interestingly, the story doesn’t end there, as...",0


In [0]:
SEED = 666

y = df[['target']]

X_train, X_test, y_train, y_test = train_test_split(
        df, y,stratify=y, test_size=0.2, random_state=SEED)


In [0]:

X_val, X_test_real, y_val, y_test_real = train_test_split(
        X_test, y_test,stratify=y_test, test_size=0.5, random_state=SEED)

In [0]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV


vectorizer = TfidfVectorizer(min_df = 3, max_df=0.5,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

In [0]:
X_train.head()

Unnamed: 0,sentences,article,N_sentence,is_propaganda,sadness,joy,fear,disgust,anger,target
31,The Nuncio added that Littleton had already fo...,782086447,32,non-propaganda,0.299419,0.027867,0.243245,0.177422,0.064517,0
21,FACT asked the Senate ethics committee to prob...,999000147,22,non-propaganda,0.38935,0.018483,0.065846,0.36244,0.294696,0
8,“There is a silence among many who call themse...,763260610,9,non-propaganda,0.688505,0.061684,0.211507,0.057255,0.11288,0
42,"The President of AMANA, Sofian Zakkout, has re...",728169864,43,propaganda,0.063294,0.450346,0.035087,0.310004,0.138506,1
81,Date of erection: 1994,761334950,82,non-propaganda,0.0442,0.195751,0.011521,0.15943,0.030789,0


In [0]:
x_train = vectorizer.fit_transform(X_train['sentences'])


In [0]:
x_train.shape

(11410, 12193)

In [0]:
type(x_train)

scipy.sparse.csr.csr_matrix

In [0]:
from scipy.sparse import hstack
model = LinearSVC(C=1.0, class_weight='balanced', multi_class='ovr', random_state=SEED)
model.fit(hstack([x_train, X_train[['sadness', 'joy', 'fear', 'disgust' , 'anger']].values]), y_train.values.ravel())

LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=666, tol=0.0001,
     verbose=0)

In [0]:
X_train[['sadness', 'joy', 'fear', 'disgust' , 'anger']].values.shape

(11410, 5)

In [0]:
y_train_predict = model.predict(hstack([x_train, X_train[['sadness', 'joy', 'fear', 'disgust' , 'anger']].values]))

In [0]:
from sklearn.metrics import classification_report

print(classification_report(y_train.values.ravel(), y_train_predict))


              precision    recall  f1-score   support

           0       0.99      0.93      0.96      8260
           1       0.84      0.97      0.90      3150

   micro avg       0.94      0.94      0.94     11410
   macro avg       0.91      0.95      0.93     11410
weighted avg       0.95      0.94      0.94     11410



In [0]:
from sklearn.metrics import classification_report

print(classification_report(y_train.values.ravel(), y_train_predict))


              precision    recall  f1-score   support

           0       0.99      0.93      0.96      8260
           1       0.84      0.97      0.90      3150

   micro avg       0.94      0.94      0.94     11410
   macro avg       0.91      0.95      0.93     11410
weighted avg       0.95      0.94      0.94     11410



In [0]:
x_val = vectorizer.transform(X_val['sentences'])

In [0]:
y_val_predict = model.predict(X_val[['sadness', 'joy', 'fear', 'disgust' , 'anger']].values)

In [0]:
print(classification_report(y_val.values.ravel(), y_val_predict))
# tf idf only

              precision    recall  f1-score   support

           0       0.82      0.77      0.79      1032
           1       0.47      0.54      0.51       394

   micro avg       0.71      0.71      0.71      1426
   macro avg       0.64      0.66      0.65      1426
weighted avg       0.72      0.71      0.71      1426



In [0]:
print(classification_report(y_val.values.ravel(), y_val_predict))
# tf idf with emotions

              precision    recall  f1-score   support

           0       0.82      0.77      0.79      1032
           1       0.48      0.55      0.51       394

   micro avg       0.71      0.71      0.71      1426
   macro avg       0.65      0.66      0.65      1426
weighted avg       0.72      0.71      0.72      1426



In [0]:
print(classification_report(y_val.values.ravel(), y_val_predict))
# emotions only

              precision    recall  f1-score   support

           0       0.82      0.58      0.68      1032
           1       0.37      0.66      0.48       394

   micro avg       0.60      0.60      0.60      1426
   macro avg       0.60      0.62      0.58      1426
weighted avg       0.69      0.60      0.62      1426



In [0]:
from sklearn.linear_model import PassiveAggressiveClassifier

In [0]:
print('aa')

aa


In [0]:
model = PassiveAggressiveClassifier(C=1.0, class_weight='balanced', multi_class='ovr', random_state=40)
model.fit(hstack([x_train, X_train[['sadness', 'joy', 'fear', 'disgust' , 'anger']].values]), y_train.values.ravel())