In [0]:
!pip install textstat


# Imports and downloads

import pandas as pd
import os
import tensorflow as tf 
from google.colab import drive
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from scipy.sparse import hstack # to concatenate features
from sklearn.svm import LinearSVC
from keras.preprocessing.text import Tokenizer
import nltk
import numpy as np
from nltk.corpus import sentiwordnet as swn, wordnet
from nltk import pos_tag, word_tokenize
from textblob import TextBlob
import textstat
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from gensim.models import KeyedVectors
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('sentiwordnet')
nltk.download('averaged_perceptron_tagger')
from sklearn.preprocessing import StandardScaler

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

drive.mount("/content/gdrive", force_remount=True)

# Read the data + emotions
df = pd.read_pickle('/content/gdrive/My Drive/datathon/task2data.pkl')

df.describe()



Using TensorFlow backend.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Found GPU at: /device:GPU:0
Mounted at /content/gdrive


Unnamed: 0,article,N_sentence,sadness,joy,fear,disgust,anger
count,14263.0,14263.0,14263.0,14263.0,14263.0,14263.0,14263.0
mean,944064400.0,47.675384,0.257924,0.202204,0.132644,0.171051,0.168878
std,1145074000.0,55.272256,0.177576,0.1922,0.131431,0.160697,0.143378
min,111111100.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,728973000.0,14.0,0.127318,0.052781,0.049964,0.056007,0.067621
50%,761897100.0,29.0,0.22208,0.143389,0.095208,0.11663,0.127791
75%,782017100.0,58.0,0.359406,0.295384,0.169027,0.24133,0.231704
max,7709564000.0,429.0,1.0,1.0,1.0,0.931034,1.0


Total of 14263 sentences.

In [0]:
df.head(10)

Unnamed: 0,sentences,article,N_sentence,is_propaganda,sadness,joy,fear,disgust,anger
0,New Audio From The Night Of The Las Vegas Mass...,704856340,1,non-propaganda,0.093891,0.481352,0.238193,0.133129,0.162108
2,Newly released audio from the Clark County Fir...,704856340,3,non-propaganda,0.456951,0.024205,0.162442,0.419307,0.281755
3,"The audio, released on the SoundCloud account ...",704856340,4,non-propaganda,0.193504,0.280196,0.125014,0.140594,0.36693
4,"Intellihub’s Shepard Ambellas, who has extensi...",704856340,5,non-propaganda,0.674633,0.155427,0.046534,0.116447,0.051066
5,“We have a firefighter’s wife at this event wh...,704856340,6,non-propaganda,0.356464,0.171645,0.098791,0.098959,0.272835
6,"We are trying to get further on the name,” dis...",704856340,7,non-propaganda,0.395951,0.058361,0.141609,0.075802,0.107938
7,"“Batallion 6, be advised that we are getting r...",704856340,8,non-propaganda,0.082155,0.041201,0.110452,0.028624,0.071853
8,"After being asked to confirm the information, ...",704856340,9,non-propaganda,0.457318,0.065556,0.055998,0.069855,0.213325
9,“The only information I have is it’s the bar o...,704856340,10,non-propaganda,0.477513,0.157989,0.067258,0.036316,0.191752
10,"Interestingly, the story doesn’t end there, as...",704856340,11,non-propaganda,0.109102,0.687029,0.089832,0.021488,0.131763


In [0]:
len(df.loc[df['is_propaganda'] == 'propaganda'])

3938

3938 of those are propaganda.

# Feature Extraction

In [0]:
# Number of words
def get_num_words(text):
    
    # Remove special chars and punctuation
    tokens = text.split()
    
    return len(tokens)

# Num of chars
def get_num_chars(text):    
    return len(text)
  
df['num_words'] = df['sentences'].apply(get_num_words)
df['num_chars'] = df['sentences'].apply(get_num_chars)

In [0]:
def count_adjectives(text):
    num_adjs = 0
    tags = pos_tag(word_tokenize(text.lower()))
    for t in tags:
      if t[1].startswith('JJ'):
        num_adjs += 1
    return num_adjs
  
  
def count_adverbs(text):
    num_adv = 0
    tags = pos_tag(word_tokenize(text.lower()))
    for t in tags:
      if t[1].startswith('RB'):
        num_adv += 1
    return num_adv
  
def count_singular_pronouns(text):
    
    sing_pro = 0
    tags = pos_tag(word_tokenize(text.lower()))
    for t in tags:
      if t[1].startswith('NNP'):
        sing_pro += 1
    return sing_pro
  
  
def count_plural_pronouns(text):
  
    plur_pro = 0
    tags = pos_tag(word_tokenize(text.lower()))
    for t in tags:
      if t[1].startswith('NNP'):
        plur_pro += 1
    return plur_pro
  
df['num_adjs'] = df['sentences'].apply(count_adjectives)
df['num_adv'] = df['sentences'].apply(count_adverbs)
df['sing_pro'] = df['sentences'].apply(count_singular_pronouns)
df['pl_pro'] = df['sentences'].apply(count_plural_pronouns)

In [0]:
def is_question(sentence):
    if len(sentence) == 0:
        return 0
    return 1 if sentence[len(sentence) - 1] == '?' else 0

def is_exclamation(sentence):
    if len(sentence) == 0:
        return 0
    return 1 if sentence[len(sentence) - 1] == '!' else 0

def is_period(sentence):
    if len(sentence) == 0:
        return 0
    return 1 if sentence[len(sentence) - 1] == '.' else 0
  
df['is_question'] = df['sentences'].apply(is_question)
df['is_exclamation'] = df['sentences'].apply(is_exclamation)
df['is_period'] = df['sentences'].apply(is_period)

In [0]:
# Polarity feat

def polarity(sentence):
    """
    :param sentences list of sentences
    :return: a list of singleton lists which contain
             a number between -1 and 1,
             where -1 is negative and 1 is positive.
    """
    return TextBlob(sentence).sentiment.polarity

df['polarity'] = df['sentences'].apply(polarity)

In [0]:
# Subjectivity feat

def subjectivity(sentence):
    """
    :param sentences list of sentences
    :return: a list of singleton lists which contain
            a number between 0 and 1,
            where 0 is objective and 1 is subjective.
    """
    return TextBlob(sentence).sentiment.subjectivity

df['subjectivity'] = df['sentences'].apply(subjectivity)

In [0]:
# Readability features

df['flesch_reading_ease'] = df['sentences'].apply(textstat.flesch_reading_ease)
df['smog_index'] = df['sentences'].apply(textstat.smog_index)
df['flesch_kincaid_grade'] = df['sentences'].apply(textstat.flesch_kincaid_grade)
df['coleman_liau_index'] = df['sentences'].apply(textstat.coleman_liau_index)
df['automated_readability_index'] = df['sentences'].apply(textstat.automated_readability_index)
df['dale_chall_readability_score'] = df['sentences'].apply(textstat.dale_chall_readability_score)
df['difficult_words'] = df['sentences'].apply(textstat.difficult_words)
df['linsear_write_formula'] = df['sentences'].apply(textstat.linsear_write_formula)
df['gunning_fog'] = df['sentences'].apply(textstat.gunning_fog)

In [0]:
def lexical_chars(chars, key_wanted):
    char_count = len(chars)

    possible_chars_map = {
        ',': 'comma_count',
        '\n': 'paragraph_count',
        ';': 'semicolon_count',
        ':': 'colon_count',
        ' ': 'spaces_count',
        '\'': 'apostrophes_count',
        '&': 'amp_count'
    }

    possible_chars = possible_chars_map.keys()

    char_analysis = {
        'digits': 0,
        'punctuation_count': 0,
        'comma_count': 0,
        'semicolon_count': 0,
        'colon_count': 0,
        'spaces_count': 0,
        'apostrophes_count': 0,
        'amp_count': 0,
        'parenthesis_count': 0,
        'paragraph_count': 0
    }

    for char in chars:
        if char in possible_chars:
            char_analysis[possible_chars_map[char]] += 1
        elif char.isdigit(): char_analysis['digits'] += 1
        elif char in ['(', ')']: char_analysis['parenthesis_count'] += 1
        if char in '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~': char_analysis['punctuation_count'] += 1

    return char_analysis[key_wanted]/char_count

df['digits'] = df['sentences'].apply(lambda x: lexical_chars(x, 'digits'))
df['punctuation_count'] = df['sentences'].apply(lambda x: lexical_chars(x, 'punctuation_count'))
df['comma_count'] = df['sentences'].apply(lambda x: lexical_chars(x, 'comma_count'))
df['semicolon_count'] = df['sentences'].apply(lambda x: lexical_chars(x, 'semicolon_count'))
df['colon_count'] = df['sentences'].apply(lambda x: lexical_chars(x, 'colon_count'))
df['spaces_count'] = df['sentences'].apply(lambda x: lexical_chars(x, 'spaces_count'))
df['apostrophes_count'] = df['sentences'].apply(lambda x: lexical_chars(x, 'apostrophes_count'))
df['amp_count'] = df['sentences'].apply(lambda x: lexical_chars(x, 'amp_count'))
df['parenthesis_count'] = df['sentences'].apply(lambda x: lexical_chars(x, 'parenthesis_count'))
df['paragraph_count'] = df['sentences'].apply(lambda x: lexical_chars(x, 'paragraph_count'))

In [0]:
def lexical_words(sent, key_wanted):
    
    word_count = len(sent)
    entry_word = word_tokenize(sent)
    words_tagged = pos_tag(entry_word)
    
    word_analysis = {
        'pronouns': 0,
        'prepositions': 0,
        'coordinating_conjunctions': 0,
        'adjectives': 0,
        'adverbs': 0,
        'determiners': 0,
        'interjections': 0,
        'modals': 0,
        'nouns': 0,
        'personal_pronouns': 0,
        'verbs': 0,
        'word_len_gte_six': 0,
        'word_len_two_and_three': 0,
        'avg_word_length': 0,
        'all_caps': 0,
        'capitalized': 0,
        'quotes_count': 0,
    }

    for (word, tag) in words_tagged:
        if tag in ['PRP']: word_analysis['personal_pronouns'] += 1
        if tag.startswith('J'): word_analysis['adjectives'] += 1
        if tag.startswith('N'): word_analysis['nouns'] += 1
        if tag.startswith('V'): word_analysis['verbs'] += 1
        if tag in ['PRP', 'PRP$', 'WP', 'WP$']: word_analysis['pronouns'] += 1
        elif tag in ['IN']: word_analysis['prepositions'] += 1
        elif tag in ['CC']: word_analysis['coordinating_conjunctions'] += 1
        elif tag in ['RB', 'RBR', 'RBS']: word_analysis['adverbs'] += 1
        elif tag in ['DT', 'PDT', 'WDT']: word_analysis['determiners'] += 1
        elif tag in ['UH']: word_analysis['interjections'] += 1
        elif tag in ['MD']: word_analysis['modals'] += 1
        if len(word) >= 6: word_analysis['word_len_gte_six'] += 1
        elif len(word) in [2, 3]: word_analysis['word_len_two_and_three'] += 1
        word_analysis['avg_word_length'] += len(word)
        if word.isupper(): word_analysis['all_caps'] += 1
        elif word[0].isupper(): word_analysis['capitalized'] += 1
        word_analysis['quotes_count'] += word.count('"') + word.count('`') + word.count('\'')

    return word_analysis[key_wanted]/word_count

df['pronouns'] = df['sentences'].apply(lambda x: lexical_words(x, 'pronouns'))
df['prepositions'] = df['sentences'].apply(lambda x: lexical_words(x, 'prepositions'))
df['coordinating_conjunctions'] = df['sentences'].apply(lambda x: lexical_words(x, 'coordinating_conjunctions'))
df['adjectives'] = df['sentences'].apply(lambda x: lexical_words(x, 'adjectives'))
df['adverbs'] = df['sentences'].apply(lambda x: lexical_words(x, 'adverbs'))
df['determiners'] = df['sentences'].apply(lambda x: lexical_words(x, 'determiners'))
df['interjections'] = df['sentences'].apply(lambda x: lexical_words(x, 'interjections'))
df['modals'] = df['sentences'].apply(lambda x: lexical_words(x, 'modals'))
df['nouns'] = df['sentences'].apply(lambda x: lexical_words(x, 'nouns'))
df['personal_pronouns'] = df['sentences'].apply(lambda x: lexical_words(x, 'personal_pronouns'))
df['verbs'] = df['sentences'].apply(lambda x: lexical_words(x, 'verbs'))
df['word_len_gte_six'] = df['sentences'].apply(lambda x: lexical_words(x, 'word_len_gte_six'))
df['word_len_two_and_three'] = df['sentences'].apply(lambda x: lexical_words(x, 'word_len_two_and_three'))
df['avg_word_length'] = df['sentences'].apply(lambda x: lexical_words(x, 'avg_word_length'))
df['all_caps'] = df['sentences'].apply(lambda x: lexical_words(x, 'all_caps'))
df['capitalized'] = df['sentences'].apply(lambda x: lexical_words(x, 'capitalized'))
df['quotes_count'] = df['sentences'].apply(lambda x: lexical_words(x, 'quotes_count'))

In [0]:
def compute_score(sentence):
    sent_score = []
    words = pos_tag(sentence.split())
    for t in words:
        word = t[0]
        tag = t[1]
        new_tag = ''
        if tag.startswith('NN'):
            new_tag = wordnet.NOUN
        elif tag.startswith('J'):
            new_tag = wordnet.ADJ
        elif tag.startswith('V'):
            new_tag = wordnet.VERB
        elif tag.startswith('R'):
            new_tag = wordnet.ADV

        if new_tag != '':
            synsets = list(swn.senti_synsets(word, new_tag))
            score = 0.0
            if len(synsets) > 0:
                for syn in synsets:
                    score += syn.pos_score() - syn.neg_score()
                sent_score.append(score / len(synsets))

    if len(sent_score)==0:
        return float(0.0)
    else:
        return np.mean(sent_score)

df['sentiment_score'] = df['sentences'].apply(compute_score)

In [0]:
def normalized_number_meanings(sentence):
    transformed = []
    tokens = word_tokenize(sentence)
    words_tagged = pos_tag(tokens)
    word_vector = [0,0,0,0]

    # 0 - verbs, 1 - adjectives, 2 - nouns, 3 - adverbs
    confusing_verb_count = 0
    for word, tag in words_tagged:
      if (tag.startswith('VB')):
        word_vector[0] += len(wordnet.synsets(word, pos=wordnet.VERB))
      elif (tag.startswith('JJ')):
        word_vector[1] += len(wordnet.synsets(word, pos=wordnet.ADJ))
      elif (tag.startswith('NN')):
        word_vector[2] += len(wordnet.synsets(word, pos=wordnet.NOUN))
      elif (tag.startswith('RB')):
        word_vector[3] += len(wordnet.synsets(word, pos=wordnet.ADV))    
    
    return (word_vector[0] + word_vector[1] + word_vector[2] + word_vector[3])/len(words_tagged)
  
df['normalized_meanings'] = df['sentences'].apply(normalized_number_meanings)

In [0]:
## Number of unique words in the text ##
df["num_unique_words"] = df["sentences"].apply(lambda x: len(set(str(x).split())))

## Number of stopwords in the text ##
df["num_stop_words"] = df["sentences"].apply(lambda x: len([w for w in str(x).lower().split() if w in ENGLISH_STOP_WORDS]))

## Number of punctuations in the text ##
# df["num_punctuations"] = df['sentences'].apply(lambda x: len([c for c in str(x) if c in str.punctuation]))

## Number of title case words in the text ##
df["num_words_upper"] = df["sentences"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))

## Number of title case words in the text ##
df["num_words_title"] = df["sentences"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))


## Average length of the words in the text ##
df["mean_word_len"] = df["sentences"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

In [0]:
# w2v = KeyedVectors.load_word2vec_format('/content/gdrive/My Drive/datathon/GoogleNews-vectors-negative300.bin', binary=True)

# def avg_google_news_vec(sentence):
#   doc = [word for word in sentence if word in w2v.vocab]
#   return np.mean(w2v[doc], axis=0)

# df['avg_google_news_vec'] = df['sentences'].apply(avg_google_news_vec)

FileNotFoundError: ignored

In [0]:
with open('/content/gdrive/My Drive/datathon/loaded_language_phrases.txt') as f_loaded_lang:
  loaded_lang=[loaded_word.rstrip() for loaded_word in f_loaded_lang.readlines()]

def loaded_language_count(sent):
  llc = 0
  sent = sent.lower()
  for loaded_word in loaded_lang:
    if loaded_word in sent:
      llc += 1
  return llc
      
df['llc'] = df['sentences'].apply(loaded_language_count)

In [0]:
with open('/content/gdrive/My Drive/datathon/exclamation_words.txt') as f_exlamation_words:
  excl_words=[excl_word.rstrip() for excl_word in f_exlamation_words.readlines()]

def excl_words_count(sent):
  ewc = 0
  sent = sent.lower().split()
  for w in sent:
    if w in excl_words:
      ewc += 1
  return ewc
      
df['ewc'] = df['sentences'].apply(excl_words_count)

In [0]:
df.shape

(14263, 58)

In [0]:
df.head(1)

Unnamed: 0,sentences,article,N_sentence,is_propaganda,sadness,joy,fear,disgust,anger,num_words,num_chars,num_adjs,num_adv,sing_pro,pl_pro,is_question,is_exclamation,is_period,polarity,subjectivity,flesch_reading_ease,smog_index,flesch_kincaid_grade,coleman_liau_index,automated_readability_index,dale_chall_readability_score,difficult_words,linsear_write_formula,gunning_fog,text_standard,digits,punctuation_count,comma_count,semicolon_count,colon_count,spaces_count,apostrophes_count,amp_count,parenthesis_count,paragraph_count,pronouns,prepositions,coordinating_conjunctions,adjectives,adverbs,determiners,interjections,modals,nouns,personal_pronouns,verbs,word_len_gte_six,word_len_two_and_three,avg_word_length,all_caps,capitalized,quotes_count,sentiment_score,normalized_meanings,num_unique_words,num_stop_words,num_words_upper,num_words_title,mean_word_len,llc,target
0,New Audio From The Night Of The Las Vegas Massacre Reveals That There Was “Another Active Shooter” In The Bar At The Top Of The Mandalay Bay Hotel During The Attack,704856340,1,non-propaganda,0.093891,0.481352,0.238193,0.133129,0.162108,0.721268,0.460634,3,0,0,0,0,0,0,0.167677,0.518182,56.93,0.0,13.0,8.37,14.4,8.74,7,16.5,21.43,8th and 9th grade,0.0,0.0,0.0,0.0,0.0,0.182927,0.0,0.0,0.0,0.0,0.0,0.042683,0.0,0.0,0.0,0.042683,0.0,0.0,0.103659,0.0,0.006098,0.04878,0.097561,0.817073,0.0,0.189024,0.0,-0.015672,2.242424,25,16,0,31,4.322581,0,0


In [0]:
df.columns

Index(['sentences', 'article', 'N_sentence', 'is_propaganda', 'sadness', 'joy',
       'fear', 'disgust', 'anger', 'num_words', 'num_chars', 'num_adjs',
       'num_adv', 'sing_pro', 'pl_pro', 'is_question', 'is_exclamation',
       'is_period', 'polarity', 'subjectivity', 'flesch_reading_ease',
       'smog_index', 'flesch_kincaid_grade', 'coleman_liau_index',
       'automated_readability_index', 'dale_chall_readability_score',
       'difficult_words', 'linsear_write_formula', 'gunning_fog', 'digits',
       'punctuation_count', 'comma_count', 'semicolon_count', 'colon_count',
       'spaces_count', 'apostrophes_count', 'amp_count', 'parenthesis_count',
       'paragraph_count', 'pronouns', 'prepositions',
       'coordinating_conjunctions', 'adjectives', 'adverbs', 'determiners',
       'interjections', 'modals', 'nouns', 'personal_pronouns', 'verbs',
       'word_len_gte_six', 'word_len_two_and_three', 'avg_word_length',
       'all_caps', 'capitalized', 'quotes_count', 'sentiment

In [0]:
SEED = 666

In [0]:
df['target'] = df['is_propaganda'].map({'propaganda': 1, 'non-propaganda': 0})
df.drop(['is_propaganda'], axis=1, inplace=True)

In [0]:
y = df[['target']]

X_train, X_rest, y_train, y_rest = train_test_split(
        df, y,stratify=y, test_size=0.2, random_state=SEED)

print(X_train.shape)


X_val, X_test, y_val, y_test = train_test_split(
        X_rest, y_rest,stratify=y_rest, test_size=0.5, random_state=SEED)

print(X_val.shape)

(11410, 65)
(1426, 65)


In [0]:
scaler = StandardScaler()

X_train[['num_words', 'num_chars']] = scaler.fit_transform(X_train[['num_words', 'num_chars']])
X_val[['num_words', 'num_chars']] = scaler.transform(X_val[['num_words', 'num_chars']])
X_test[['num_words', 'num_chars']] = scaler.transform(X_test[['num_words', 'num_chars']])

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.

In [0]:
X_train.head()

Unnamed: 0,sentences,article,N_sentence,is_propaganda,sadness,joy,fear,disgust,anger,num_words,...,quotes_count,sentiment_score,normalized_meanings,num_unique_words,num_stop_words,num_words_upper,num_words_title,mean_word_len,llc,target
31,The Nuncio added that Littleton had already fo...,782086447,32,non-propaganda,0.299419,0.027867,0.243245,0.177422,0.064517,41,...,0.0,0.022667,3.282609,37,22,0,5,5.414634,0,0
21,FACT asked the Senate ethics committee to prob...,999000147,22,non-propaganda,0.38935,0.018483,0.065846,0.36244,0.294696,21,...,0.0,0.03022,2.12,20,4,1,9,6.047619,0,0
8,“There is a silence among many who call themse...,763260610,9,non-propaganda,0.688505,0.061684,0.211507,0.057255,0.11288,10,...,0.0,-0.008757,5.0,10,7,0,1,4.7,0,0
42,"The President of AMANA, Sofian Zakkout, has re...",728169864,43,propaganda,0.063294,0.450346,0.035087,0.310004,0.138506,18,...,0.0,0.099388,2.291667,17,7,1,7,4.5,0,1
81,Date of erection: 1994,761334950,82,non-propaganda,0.0442,0.195751,0.011521,0.15943,0.030789,4,...,0.0,-0.015625,2.2,4,1,0,1,4.75,0,0


In [0]:
# Maybe try different params/vectorisers, stemming etc.? - got these from the winning system for task 1.
vectorizer = TfidfVectorizer(min_df = 3, max_df=0.5,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

x_train = vectorizer.fit_transform(X_train['sentences'])
x_val = vectorizer.transform(X_val['sentences'])

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier

pipe_lrSVC = Pipeline([('scaler', StandardScaler()),('clf', LinearSVC(C=1.0, class_weight='balanced', multi_class='ovr', random_state=SEED))])
pipe_lrSVC.fit(X_train[['sadness', 'joy',
       'fear', 'disgust', 'anger', 'num_words', 'num_chars', 'num_adjs',
       'num_adv', 'sing_pro', 'pl_pro', 'is_question', 'is_exclamation',
       'is_period', 'polarity', 'subjectivity', 'flesch_reading_ease',
       'smog_index', 'flesch_kincaid_grade', 'coleman_liau_index',
       'automated_readability_index', 'dale_chall_readability_score',
       'difficult_words', 'linsear_write_formula', 'gunning_fog', 'digits',
       'punctuation_count', 'comma_count', 'semicolon_count', 'colon_count',
       'spaces_count', 'apostrophes_count', 'amp_count', 'parenthesis_count',
       'paragraph_count', 'pronouns', 'prepositions',
       'coordinating_conjunctions', 'adjectives', 'adverbs', 'determiners',
       'interjections', 'modals', 'nouns', 'personal_pronouns', 'verbs',
       'word_len_gte_six', 'word_len_two_and_three', 'avg_word_length',
       'all_caps', 'capitalized', 'quotes_count', 'sentiment_score',
       'normalized_meanings', 'num_unique_words', 'num_stop_words',
       'num_words_upper', 'num_words_title', 'mean_word_len']], y_train)
y_pred = pipe_lrSVC.predict(X_val[['sadness', 'joy',
       'fear', 'disgust', 'anger', 'num_words', 'num_chars', 'num_adjs',
       'num_adv', 'sing_pro', 'pl_pro', 'is_question', 'is_exclamation',
       'is_period', 'polarity', 'subjectivity', 'flesch_reading_ease',
       'smog_index', 'flesch_kincaid_grade', 'coleman_liau_index',
       'automated_readability_index', 'dale_chall_readability_score',
       'difficult_words', 'linsear_write_formula', 'gunning_fog', 'digits',
       'punctuation_count', 'comma_count', 'semicolon_count', 'colon_count',
       'spaces_count', 'apostrophes_count', 'amp_count', 'parenthesis_count',
       'paragraph_count', 'pronouns', 'prepositions',
       'coordinating_conjunctions', 'adjectives', 'adverbs', 'determiners',
       'interjections', 'modals', 'nouns', 'personal_pronouns', 'verbs',
       'word_len_gte_six', 'word_len_two_and_three', 'avg_word_length',
       'all_caps', 'capitalized', 'quotes_count', 'sentiment_score',
       'normalized_meanings', 'num_unique_words', 'num_stop_words',
       'num_words_upper', 'num_words_title', 'mean_word_len']])

  y = column_or_1d(y, warn=True)


In [0]:
from sklearn.metrics import f1_score

In [0]:
# NN without scaling

print(f1_score(y_val.values.ravel(), y_pred))

0.3009523809523809


In [0]:
# NN with scaling

print(f1_score(y_val.values.ravel(), y_pred))

0.37623762376237624


In [0]:
# SVM without scaling

print(f1_score(y_val.values.ravel(), y_pred))

0.4428002276607854


In [0]:
# SVM with scaling

print(f1_score(y_val.values.ravel(), y_pred))

0.5004926108374385


In [0]:
# NN without tf idf

print(classification_report(y_val.values.ravel(), y_pred))

In [0]:
# NN without tf idf

print(classification_report(y_val.values.ravel(), y_pred))

              precision    recall  f1-score   support

           0       0.77      0.83      0.79      1032
           1       0.42      0.34      0.38       394

   micro avg       0.69      0.69      0.69      1426
   macro avg       0.60      0.58      0.59      1426
weighted avg       0.67      0.69      0.68      1426



In [0]:
# NN without tf idf no scale

print(classification_report(y_val.values.ravel(), y_pred))

              precision    recall  f1-score   support

           0       0.76      0.95      0.84      1032
           1       0.60      0.20      0.30       394

   micro avg       0.74      0.74      0.74      1426
   macro avg       0.68      0.58      0.57      1426
weighted avg       0.71      0.74      0.69      1426



In [0]:
# SVM except tf idf
print(classification_report(y_val.values.ravel(), y_pred))

              precision    recall  f1-score   support

           0       0.82      0.65      0.72      1032
           1       0.41      0.64      0.50       394

   micro avg       0.64      0.64      0.64      1426
   macro avg       0.62      0.64      0.61      1426
weighted avg       0.71      0.64      0.66      1426



In [0]:
# All feats
#hstack([x_train, X_train[['sadness', 'joy',
#        'fear', 'disgust', 'anger', 'num_words', 'num_chars', 'num_adjs',
#        'num_adv', 'sing_pro', 'pl_pro', 'is_question', 'is_exclamation',
#        'is_period', 'polarity', 'subjectivity', 'flesch_reading_ease',
#        'smog_index', 'flesch_kincaid_grade', 'coleman_liau_index',
#        'automated_readability_index', 'dale_chall_readability_score',
#        'difficult_words', 'linsear_write_formula', 'gunning_fog',
#        'digits', 'punctuation_count', 'comma_count',
#        'semicolon_count', 'colon_count', 'spaces_count', 'apostrophes_count',
#        'amp_count', 'parenthesis_count', 'paragraph_count', 'pronouns',
#        'prepositions', 'coordinating_conjunctions', 'adjectives', 'adverbs',
#        'determiners', 'interjections', 'modals', 'nouns', 'personal_pronouns',
#        'verbs', 'word_len_gte_six', 'word_len_two_and_three',
#        'avg_word_length', 'all_caps', 'capitalized', 'quotes_count',
#        'sentiment_score', 'normalized_meanings', 'num_unique_words',
#        'num_stop_words', 'num_words_upper', 'num_words_title', 'mean_word_len',
#        'llc']].values

In [0]:
model = LinearSVC(C=1.0, class_weight='balanced', multi_class='ovr', random_state=SEED)
# Train only with text data
model.fit(x_train, y_train.values.ravel())


# Or with more features
# model.fit(hstack([x_train, X_train[['sadness', 'joy', 'fear', 'disgust' , 'anger']].values]), y_train.values.ravel())

# model.fit(hstack([x_train, X_train[['sadness', 'joy',
#        'fear', 'disgust', 'anger', 'polarity', 'subjectivity', 'llc', 'is_question', 'is_exclamation',
#        'is_period', 'flesch_reading_ease',
#        'smog_index', 'flesch_kincaid_grade', 'coleman_liau_index',
#        'automated_readability_index', 'dale_chall_readability_score',
#        'difficult_words', 'linsear_write_formula', 'gunning_fog']].values]), y_train.values.ravel())

LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=666, tol=0.0001,
     verbose=0)

In [0]:
#pd.set_option('max_colwidth', 800)
pd.set_option('display.max_columns', 500)


In [0]:
# Only tf-idf

y_val_predict = model.predict(x_val)
print(f1_score(y_val.values.ravel(), y_val_predict))

0.506508875739645


In [0]:
# Only tf-idf

y_val_predict = model.predict(x_val)
print(classification_report(y_val.values.ravel(), y_val_predict))

              precision    recall  f1-score   support

           0       0.82      0.77      0.79      1032
           1       0.47      0.54      0.51       394

   micro avg       0.71      0.71      0.71      1426
   macro avg       0.64      0.66      0.65      1426
weighted avg       0.72      0.71      0.71      1426



In [0]:
# All features
y_val_predict = model.predict(hstack([x_val, X_val[['sadness', 'joy',
       'fear', 'disgust', 'anger', 'num_words', 'num_chars', 'num_adjs',
       'num_adv', 'sing_pro', 'pl_pro', 'is_question', 'is_exclamation',
       'is_period', 'polarity', 'subjectivity', 'flesch_reading_ease',
       'smog_index', 'flesch_kincaid_grade', 'coleman_liau_index',
       'automated_readability_index', 'dale_chall_readability_score',
       'difficult_words', 'linsear_write_formula', 'gunning_fog',
       'digits', 'punctuation_count', 'comma_count',
       'semicolon_count', 'colon_count', 'spaces_count', 'apostrophes_count',
       'amp_count', 'parenthesis_count', 'paragraph_count', 'pronouns',
       'prepositions', 'coordinating_conjunctions', 'adjectives', 'adverbs',
       'determiners', 'interjections', 'modals', 'nouns', 'personal_pronouns',
       'verbs', 'word_len_gte_six', 'word_len_two_and_three',
       'avg_word_length', 'all_caps', 'capitalized', 'quotes_count',
       'sentiment_score', 'normalized_meanings', 'num_unique_words',
       'num_stop_words', 'num_words_upper', 'num_words_title', 'mean_word_len',
       'llc']].values]))

print(classification_report(y_val.values.ravel(), y_val_predict))


              precision    recall  f1-score   support

           0       0.83      0.66      0.73      1032
           1       0.42      0.63      0.50       394

   micro avg       0.65      0.65      0.65      1426
   macro avg       0.62      0.65      0.62      1426
weighted avg       0.71      0.65      0.67      1426



In [0]:
# Tf-idf + emotions

y_val_predict = model.predict(hstack([x_val, X_val[['sadness', 'joy',
       'fear', 'disgust', 'anger']].values]))

print(classification_report(y_val.values.ravel(), y_val_predict))


              precision    recall  f1-score   support

           0       0.82      0.77      0.79      1032
           1       0.48      0.55      0.51       394

   micro avg       0.71      0.71      0.71      1426
   macro avg       0.65      0.66      0.65      1426
weighted avg       0.72      0.71      0.72      1426



In [0]:
# Tf-idf + emotions + polarity

y_val_predict = model.predict(hstack([x_val, X_val[['sadness', 'joy',
       'fear', 'disgust', 'anger', 'polarity']].values]))

print(classification_report(y_val.values.ravel(), y_val_predict))


              precision    recall  f1-score   support

           0       0.82      0.77      0.79      1032
           1       0.48      0.55      0.51       394

   micro avg       0.71      0.71      0.71      1426
   macro avg       0.65      0.66      0.65      1426
weighted avg       0.72      0.71      0.72      1426



In [0]:
# Tf-idf + emotions + polarity + subjectivity

y_val_predict = model.predict(hstack([x_val, X_val[['sadness', 'joy',
       'fear', 'disgust', 'anger', 'polarity', 'subjectivity']].values]))

print(classification_report(y_val.values.ravel(), y_val_predict))


              precision    recall  f1-score   support

           0       0.82      0.78      0.80      1032
           1       0.49      0.55      0.52       394

   micro avg       0.71      0.71      0.71      1426
   macro avg       0.65      0.66      0.66      1426
weighted avg       0.73      0.71      0.72      1426



In [0]:
# Tf-idf + emotions + polarity + subjectivity + loaded language

y_val_predict = model.predict(hstack([x_val, X_val[['sadness', 'joy',
       'fear', 'disgust', 'anger', 'polarity', 'subjectivity', 'llc']].values]))

print(classification_report(y_val.values.ravel(), y_val_predict))


              precision    recall  f1-score   support

           0       0.82      0.78      0.80      1032
           1       0.49      0.57      0.53       394

   micro avg       0.72      0.72      0.72      1426
   macro avg       0.66      0.67      0.66      1426
weighted avg       0.73      0.72      0.72      1426



In [0]:
# Tf-idf + emotions + polarity + subjectivity + loaded language

y_val_predict = model.predict(hstack([x_val, X_val[['sadness', 'joy',
       'fear', 'disgust', 'anger', 'polarity', 'subjectivity', 'llc', 'num_words', 'num_chars']].values]))

print(classification_report(y_val.values.ravel(), y_val_predict))


              precision    recall  f1-score   support

           0       0.82      0.79      0.80      1032
           1       0.50      0.55      0.52       394

   micro avg       0.72      0.72      0.72      1426
   macro avg       0.66      0.67      0.66      1426
weighted avg       0.73      0.72      0.73      1426



In [0]:
# Tf-idf + emotions + polarity + subjectivity + loaded language

y_val_predict = model.predict(hstack([x_val, X_val[['sadness', 'joy',
       'fear', 'disgust', 'anger', 'polarity', 'subjectivity', 'llc', 'is_question', 'is_exclamation',
       'is_period']].values]))

print(classification_report(y_val.values.ravel(), y_val_predict))


              precision    recall  f1-score   support

           0       0.83      0.79      0.80      1032
           1       0.50      0.56      0.53       394

   micro avg       0.72      0.72      0.72      1426
   macro avg       0.66      0.67      0.67      1426
weighted avg       0.74      0.72      0.73      1426



In [0]:
# Tf-idf + emotions + polarity + subjectivity + loaded language

y_val_predict = model.predict(hstack([x_val, X_val[['sadness', 'joy',
       'fear', 'disgust', 'anger', 'polarity', 'subjectivity', 'llc', 'is_question', 'is_exclamation',
       'is_period', 'normalized_meanings']].values]))

print(classification_report(y_val.values.ravel(), y_val_predict))


              precision    recall  f1-score   support

           0       0.82      0.78      0.80      1032
           1       0.49      0.56      0.52       394

   micro avg       0.72      0.72      0.72      1426
   macro avg       0.65      0.67      0.66      1426
weighted avg       0.73      0.72      0.72      1426



In [0]:

# Tf-idf + emotions + polarity + subjectivity + loaded language

y_val_predict = model.predict(hstack([x_val, X_val[['sadness', 'joy',
       'fear', 'disgust', 'anger', 'polarity', 'subjectivity', 'llc', 'is_question', 'is_exclamation',
       'is_period', 'flesch_reading_ease',
       'smog_index', 'flesch_kincaid_grade', 'coleman_liau_index',
       'automated_readability_index', 'dale_chall_readability_score',
       'difficult_words', 'linsear_write_formula', 'gunning_fog'
      ]].values]))

print(classification_report(y_val.values.ravel(), y_val_predict))


              precision    recall  f1-score   support

           0       0.77      0.90      0.83      1032
           1       0.51      0.28      0.36       394

   micro avg       0.73      0.73      0.73      1426
   macro avg       0.64      0.59      0.59      1426
weighted avg       0.70      0.73      0.70      1426



In [0]:

# Tf-idf + emotions + polarity + subjectivity + loaded language

y_val_predict = model.predict(hstack([x_val, X_val[['sadness', 'joy',
       'fear', 'disgust', 'anger', 'polarity', 'subjectivity', 'llc', 'is_question', 'is_exclamation',
       'is_period', 'flesch_reading_ease',
       'smog_index', 'flesch_kincaid_grade', 'coleman_liau_index',
       'automated_readability_index', 'dale_chall_readability_score',
       'difficult_words', 'linsear_write_formula', 'gunning_fog'
      ]].values]))

print(classification_report(y_val.values.ravel(), y_val_predict))


              precision    recall  f1-score   support

           0       0.85      0.62      0.71      1032
           1       0.42      0.71      0.53       394

   micro avg       0.64      0.64      0.64      1426
   macro avg       0.63      0.67      0.62      1426
weighted avg       0.73      0.64      0.66      1426



In [0]:
for blah in ['sadness', 'joy',
       'fear', 'disgust', 'anger', 'num_words', 'num_chars', 'num_adjs',
       'num_adv', 'sing_pro', 'pl_pro', 'is_question', 'is_exclamation',
       'is_period', 'polarity', 'subjectivity', 'flesch_reading_ease',
       'smog_index', 'flesch_kincaid_grade', 'coleman_liau_index',
       'automated_readability_index', 'dale_chall_readability_score',
       'difficult_words', 'linsear_write_formula', 'gunning_fog',
       'digits', 'punctuation_count', 'comma_count',
       'semicolon_count', 'colon_count', 'spaces_count', 'apostrophes_count',
       'amp_count', 'parenthesis_count', 'paragraph_count', 'pronouns',
       'prepositions', 'coordinating_conjunctions', 'adjectives', 'adverbs',
       'determiners', 'interjections', 'modals', 'nouns', 'personal_pronouns',
       'verbs', 'word_len_gte_six', 'word_len_two_and_three',
       'avg_word_length', 'all_caps', 'capitalized', 'quotes_count',
       'sentiment_score', 'normalized_meanings', 'num_unique_words',
       'num_stop_words', 'num_words_upper', 'num_words_title', 'mean_word_len',
       'llc']:

  model = LinearSVC(C=1.0, class_weight='balanced', multi_class='ovr', random_state=SEED)
  # Train only with text data
  print(blah)
  model.fit(x_train, y_train.values.ravel())

  
  # Or with more features
  # model.fit(hstack([x_train, X_train[['sadness', 'joy', 'fear', 'disgust' , 'anger']].values]), y_train.values.ravel())

  model.fit(X_train[[blah]].values, y_train.values.ravel())

sadness
joy
fear




disgust
anger
num_words
num_chars
num_adjs
num_adv




sing_pro




pl_pro




is_question
is_exclamation




is_period
polarity




subjectivity
flesch_reading_ease




smog_index




flesch_kincaid_grade




coleman_liau_index




automated_readability_index




dale_chall_readability_score
difficult_words
linsear_write_formula




gunning_fog
digits




punctuation_count




comma_count




semicolon_count




colon_count




spaces_count
apostrophes_count




amp_count




parenthesis_count




paragraph_count
pronouns




prepositions




coordinating_conjunctions




adjectives




adverbs




determiners




interjections




modals




nouns
personal_pronouns




verbs




word_len_gte_six
word_len_two_and_three




avg_word_length




all_caps




capitalized




quotes_count




sentiment_score




normalized_meanings




num_unique_words
num_stop_words
num_words_upper




num_words_title




mean_word_len




llc




In [0]:
X_train.head(10)

Unnamed: 0,sentences,article,N_sentence,is_propaganda,sadness,joy,fear,disgust,anger,num_words,num_chars,num_adjs,num_adv,sing_pro,pl_pro,is_question,is_exclamation,is_period,polarity,subjectivity,flesch_reading_ease,smog_index,flesch_kincaid_grade,coleman_liau_index,automated_readability_index,dale_chall_readability_score,difficult_words,linsear_write_formula,gunning_fog,text_standard,digits,punctuation_count,comma_count,semicolon_count,colon_count,spaces_count,apostrophes_count,amp_count,parenthesis_count,paragraph_count,pronouns,prepositions,coordinating_conjunctions,adjectives,adverbs,determiners,interjections,modals,nouns,personal_pronouns,verbs,word_len_gte_six,word_len_two_and_three,avg_word_length,all_caps,capitalized,quotes_count,sentiment_score,normalized_meanings,num_unique_words,num_stop_words,num_words_upper,num_words_title,mean_word_len,llc,target
31,"The Nuncio added that Littleton had already forwarded his Memorandum to about twenty people, including civil and ecclesiastical judicial authorities, police and lawyers, in June 2006, and that it was therefore very likely that the news would soon be made public.",782086447,32,non-propaganda,0.235276,-0.911798,0.848839,0.053167,-0.723438,1.405955,1.559952,1.903499,2.510659,-0.409571,-0.409571,-0.25546,-0.118878,0.565002,-0.19199,0.754552,-1.132919,-0.040635,1.493139,0.652393,1.404653,0.381251,1.072258,1.577896,0.948402,20th and 21st grade,0.211,-0.209878,0.746223,-0.093158,-0.17347,-0.030168,-0.177548,-0.048092,-0.095577,0.0,-0.267249,-0.012941,0.881524,0.686396,0.406458,-0.692975,-0.045957,0.227527,-0.589289,-0.324255,-0.223071,0.315972,-0.528557,0.007077,-0.252268,-0.493434,-0.205636,0.363518,0.199365,1.555694,1.675983,-0.444427,0.496643,0.297889,-0.414111,0
21,"FACT asked the Senate ethics committee to probe fundraising emails sent by Ms. Warren, Massachusetts Democrat, and Ms. Harris, California Democrat.",999000147,22,non-propaganda,0.743346,-0.960339,-0.507128,1.21575,0.886222,0.026341,0.266129,-0.518542,-0.834346,-0.409571,-0.409571,-0.25546,-0.118878,0.565002,-0.19199,-0.997068,-0.069015,17.823058,-0.426289,0.54097,-0.228516,0.528296,0.620352,-0.916639,-0.009407,9th and 10th grade,-0.20809,0.089794,1.248788,-0.093158,-0.17347,-0.54499,-0.177548,-0.048092,-0.095577,0.0,-0.754521,-0.918709,0.260994,-0.825508,-0.613536,-0.746294,-0.045957,-0.37445,0.592356,-0.597938,-0.19668,0.972127,-0.660229,0.513972,0.212546,0.761029,-0.205636,0.475084,-0.607795,0.118909,-0.756139,0.981187,1.761885,0.817716,-0.414111,0
8,“There is a silence among many who call themselves left.,763260610,9,non-propaganda,2.433438,-0.736872,0.606246,-0.701917,-0.385232,-0.732447,-0.757679,-0.518542,-0.834346,-0.409571,-0.409571,-0.25546,-0.118878,0.565002,0.976212,-0.175996,1.141325,-0.040635,-1.104583,-0.202747,-0.800125,-0.785921,-0.961321,-0.86467,-1.193693,7th and 8th grade,-0.20809,-0.226795,-0.746244,-0.093158,-0.17347,0.218998,-0.177548,-0.048092,-0.095577,0.0,1.525213,-0.103422,-0.647268,0.353417,-0.613536,-0.036899,-0.045957,-0.37445,-0.559507,0.682506,1.213547,-0.892961,-0.794986,-0.238253,-0.252268,-0.529955,-0.205636,-0.100648,1.391691,-0.726259,-0.350785,-0.444427,-0.768598,-0.288991,-0.414111,0
42,"The President of AMANA, Sofian Zakkout, has referred to Duke as “David Duke, a man to believe in!”",728169864,43,propaganda,-1.098724,1.27357,-0.742237,0.886262,-0.206027,-0.180601,-0.285152,-0.518542,-0.834346,-0.409571,-0.409571,-0.25546,-0.118878,-1.769904,-0.19199,-0.997068,0.630183,-0.040635,-0.484016,-0.292691,-0.438495,-0.072139,-0.283462,-0.344975,-0.239423,8th and 9th grade,-0.20809,0.089794,2.246304,-0.093158,-0.17347,0.614164,-0.177548,-0.048092,-0.095577,0.0,-0.754521,0.837294,-0.647268,-0.825508,-0.613536,0.126807,-0.045957,-0.37445,0.88961,-0.597938,-0.014715,-0.163144,0.417833,-0.627334,0.444953,1.064789,-0.205636,1.496778,-0.488613,-0.134642,-0.350785,0.981187,1.129264,-0.453238,-0.414111,1
81,Date of erection: 1994,761334950,82,non-propaganda,-1.206597,-0.04338,-0.922366,-0.059888,-0.959301,-1.146331,-1.1402,-1.00295,-0.834346,-0.409571,-0.409571,-0.25546,-0.118878,-1.769904,-0.19199,-0.997068,1.068261,-0.040635,-1.277765,-0.876657,-1.126759,-0.151788,-0.961321,-1.176487,-0.769049,2nd and 3rd grade,4.782887,0.153752,-0.746244,-0.093158,4.847718,-0.53541,-0.177548,-0.048092,-0.095577,0.0,-0.754521,1.931946,-0.647268,-0.825508,-0.613536,-1.182845,-0.045957,-0.37445,0.646402,-0.597938,-1.652398,-0.428532,-0.537722,0.504539,-0.252268,0.29158,-0.205636,-0.202099,-0.552254,-1.233359,-1.161493,-0.444427,-0.768598,-0.24793,-0.414111,0
7,"Last year, eight people were infected and four people died after an outbreak of the disease.",754111899,8,non-propaganda,3.048578,-0.632714,0.130715,-0.737181,-0.699186,-0.318563,-0.352656,-0.518542,-0.834346,-0.409571,-0.409571,-0.25546,-0.118878,0.565002,-0.19199,-0.778115,0.431954,-0.040635,-0.426289,0.026812,-0.391833,-0.271262,-0.509415,-0.448914,-0.497748,7th and 8th grade,-0.20809,-0.173265,0.316327,-0.093158,-0.17347,0.291158,-0.177548,-0.048092,-0.095577,0.0,-0.754521,0.182883,0.803977,-0.107901,-0.613536,0.212219,-0.045957,-0.37445,-0.152397,-0.597938,0.092091,-0.004488,-0.58992,-0.309302,-0.252268,-0.737965,-0.205636,-0.527843,-0.112552,-0.303675,-0.08055,-0.444427,-0.768598,-0.196603,-0.414111,0
27,"However, since January 2010, the Bureau of Labor Statistics [BLS] has begun to publish figures for foreign-born and native-born employment.",761969038,28,non-propaganda,-0.194282,-0.418951,0.092696,-0.768592,-0.176013,-0.04264,0.176124,0.450275,0.001906,-0.409571,-0.409571,-0.25546,-0.118878,0.565002,-0.19199,-0.997068,-0.997272,-0.040635,0.641663,0.813487,0.494745,0.81626,0.620352,0.382598,0.693615,14th and 15th grade,0.581849,0.22139,0.660325,-0.093158,-0.17347,-0.525279,-0.177548,-0.048092,-0.095577,0.0,-0.754521,0.171349,0.313268,0.124417,-0.132892,-0.721169,-0.045957,-0.37445,0.074833,-0.597938,-0.497772,0.491416,-0.408164,0.494564,0.239298,0.009275,-0.205636,-0.120527,-0.219006,0.118909,-0.350785,0.981187,0.496643,0.77861,-0.414111,0
30,This marks the country’s ninth epidemic since the ebola virus was identified in 1976.,756114837,31,non-propaganda,1.250405,-0.970534,3.870506,-0.544336,-0.931706,-0.456524,-0.43141,-0.034134,-0.834346,1.007411,1.007411,-0.25546,-0.118878,0.565002,-0.19199,-0.997068,0.494536,-0.040635,-0.541743,0.281877,-0.32184,0.172937,-0.283462,-0.448914,-0.128543,6th and 7th grade,1.083692,-0.310804,-0.746244,-0.093158,-0.17347,-0.021821,-0.177548,-0.048092,-0.095577,0.0,-0.754521,0.31492,-0.647268,0.727899,-0.613536,1.082082,-0.045957,-0.37445,-0.311649,-0.597938,0.865138,-0.912996,-0.495349,-0.001142,-0.252268,-0.711318,-0.205636,0.435856,-0.241876,-0.472709,-0.485903,-0.444427,-0.768598,0.074697,-0.414111,0
38,She also met with Secretary of Defense James Mattis in Texas.,999001297,39,non-propaganda,1.141804,0.030585,-0.375306,0.413049,-0.659696,-0.663466,-0.701425,-1.00295,0.001906,-0.409571,-0.409571,-0.25546,-0.118878,0.565002,-0.19199,-0.997068,0.327444,-0.040635,-0.541743,-0.201405,-0.776794,-0.375419,-0.735368,-0.708761,-0.760792,5th and 6th grade,-0.20809,-0.246978,-0.746244,-0.093158,-0.17347,0.318761,-0.177548,-0.048092,-0.095577,0.0,0.291914,2.206731,-0.647268,-0.825508,0.481703,-1.182845,-0.045957,-0.37445,0.451038,0.577552,-0.775387,-0.250882,-0.006323,-0.33648,-0.252268,1.866519,-0.205636,-0.102441,0.292436,-0.641742,-0.621021,-0.444427,0.812954,-0.341251,-0.414111,0
7,"In contrast, President Trump issued a statement claiming that “Ramadan reminds us of the richness Muslims add to the religious tapestry of American life.”",759468687,8,propaganda,0.274326,-0.086448,-0.836827,1.476369,-0.308406,0.233283,0.344884,-0.034134,-0.834346,1.007411,1.007411,-0.25546,-0.118878,-1.769904,-0.19199,-0.586532,-0.600813,-0.040635,0.526209,0.642996,0.436417,1.159365,1.298212,0.382598,1.157185,13th and 14th grade,-0.20809,-0.29395,-0.111461,-0.093158,-0.17347,-0.133059,-0.177548,-0.048092,-0.095577,0.0,-0.340024,0.495216,-0.647268,0.031892,-0.613536,0.067277,-0.045957,-0.37445,0.220787,-0.132322,-0.262849,1.119564,-0.366212,0.108384,-0.252268,0.098278,-0.205636,1.02334,-0.41836,0.287942,-0.08055,-0.444427,0.812954,0.333776,-0.414111,1


In [0]:
scaler = StandardScaler()

X_train[['gunning_fog']] = scaler.fit_transform(X_train[['gunning_fog']])
X_val[['gunning_fog']] = scaler.transform(X_val[['gunning_fog']])
X_test[['gunning_fog']] = scaler.transform(X_test[['gunning_fog']])|

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value inst

In [0]:
[['sadness', 'joy',
       'fear', 'disgust', 'anger', 'num_words', 'num_chars', 'num_adjs',
       'num_adv', 'sing_pro', 'pl_pro', 'is_question', 'is_exclamation',
       'is_period', 'polarity', 'subjectivity', 'flesch_reading_ease',
       'smog_index', 'flesch_kincaid_grade', 'coleman_liau_index',
       'automated_readability_index', 'dale_chall_readability_score',
       'difficult_words', 'linsear_write_formula', 'gunning_fog',
       'digits', 'punctuation_count', 'comma_count',
       'semicolon_count', 'colon_count', 'spaces_count', 'apostrophes_count',
       'amp_count', 'parenthesis_count', 'paragraph_count', 'pronouns',
       'prepositions', 'coordinating_conjunctions', 'adjectives', 'adverbs',
       'determiners', 'interjections', 'modals', 'nouns', 'personal_pronouns',
       'verbs', 'word_len_gte_six', 'word_len_two_and_three',
       'avg_word_length', 'all_caps', 'capitalized', 'quotes_count',
       'sentiment_score', 'normalized_meanings', 'num_unique_words',
       'num_stop_words', 'num_words_upper', 'num_words_title', 'mean_word_len',
       'llc']]

In [0]:
X_train[['sadness', 'joy',
       'fear', 'disgust', 'anger', 'num_words', 'num_chars', 'num_adjs',
       'num_adv', 'sing_pro', 'pl_pro', 'is_question', 'is_exclamation',
       'is_period', 'polarity', 'subjectivity', 'flesch_reading_ease',
       'smog_index', 'flesch_kincaid_grade', 'coleman_liau_index',
       'automated_readability_index', 'dale_chall_readability_score',
       'difficult_words', 'linsear_write_formula', 'gunning_fog',
       'digits', 'punctuation_count', 'comma_count',
       'semicolon_count', 'colon_count', 'spaces_count', 'apostrophes_count',
       'amp_count', 'parenthesis_count', 'paragraph_count', 'pronouns',
       'prepositions', 'coordinating_conjunctions', 'adjectives', 'adverbs',
       'determiners', 'interjections', 'modals', 'nouns', 'personal_pronouns',
       'verbs', 'word_len_gte_six', 'word_len_two_and_three',
       'avg_word_length', 'all_caps', 'capitalized', 'quotes_count',
       'sentiment_score', 'normalized_meanings', 'num_unique_words',
       'num_stop_words', 'num_words_upper', 'num_words_title', 'mean_word_len',
       'llc']] = scaler.fit_transform(X_train[['sadness', 'joy',
       'fear', 'disgust', 'anger', 'num_words', 'num_chars', 'num_adjs',
       'num_adv', 'sing_pro', 'pl_pro', 'is_question', 'is_exclamation',
       'is_period', 'polarity', 'subjectivity', 'flesch_reading_ease',
       'smog_index', 'flesch_kincaid_grade', 'coleman_liau_index',
       'automated_readability_index', 'dale_chall_readability_score',
       'difficult_words', 'linsear_write_formula', 'gunning_fog',
       'digits', 'punctuation_count', 'comma_count',
       'semicolon_count', 'colon_count', 'spaces_count', 'apostrophes_count',
       'amp_count', 'parenthesis_count', 'paragraph_count', 'pronouns',
       'prepositions', 'coordinating_conjunctions', 'adjectives', 'adverbs',
       'determiners', 'interjections', 'modals', 'nouns', 'personal_pronouns',
       'verbs', 'word_len_gte_six', 'word_len_two_and_three',
       'avg_word_length', 'all_caps', 'capitalized', 'quotes_count',
       'sentiment_score', 'normalized_meanings', 'num_unique_words',
       'num_stop_words', 'num_words_upper', 'num_words_title', 'mean_word_len',
       'llc']])

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
