# Feature engineering

## Import packages

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from statsmodels.stats import weightstats as stests
from collections import Counter
from ipynb.fs.full.bespoke_functions import plot_comparison_histogram, get_number_capitals

## Read in datasets

In [3]:
# Load the real-news and fake-news CSV files into two separate DataFrames.
real_df, fake_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '~/documents/Data/Fake vs Real News/real_clean.csv',
        '~/documents/Data/Fake vs Real News/fake_clean.csv',
    )
)

## Length of title

I hypothesise that there is likely to be a relationship between the number of characters in the title of an article and the article's verisimilitude.

In [4]:
# Character count of every title, stored as a new column on both frames.
for frame in (real_df, fake_df):
    frame['length_title'] = frame['title'].map(len)

#### Histogram

In [5]:
# Plot `length_title` for both datasets with the project's histogram helper.
plot_comparison_histogram(
    real_df,
    fake_df,
    'length_title',
    title='Histogram of number of characters in the title of a news article',
    xlabel='Number of characters',
)

NameError: name 'plot_comparison_histogram' is not defined

By visual inspection, the histogram shows there is likely to be a relationship between the number of characters in the title of the article and the article's verisimilitude, i.e. that an article with a longer title is more likely to be fake news.

#### Independent sample t-test

In [None]:
# Two-sample t-test on title length (real vs fake); report the statistic and p-value.
t, p = stats.ttest_ind(
    real_df['length_title'],
    fake_df['length_title'],
)
print("t = ", t, sep="")
print("p = ", p, sep="")

The p value approximates to zero, suggesting there is strong evidence that the means of these two samples are different, which supports the hypothesis.

## Length of body of article

I hypothesise that there is likely to be a relationship between the number of characters in the body of an article and the article's verisimilitude.

In [None]:
# Character count of every article body, stored as a new column on both frames.
for frame in (real_df, fake_df):
    frame['length_text'] = frame['text'].map(len)

#### Histogram

In [None]:
# Plot `length_text` for both datasets with the project's histogram helper.
plot_comparison_histogram(
    real_df,
    fake_df,
    'length_text',
    title='Histogram of number of characters in the body of a news article',
    xlabel='Number of characters',
    bins=200,
    xlim=7500,
)

By visual inspection, the histogram shows there may be a relationship between the number of characters in the body of an article and the article's verisimilitude, but this is a complex non-linear relationship that may be a result of structural differences in the data. I will therefore remove this column to reduce the dimensionality of the problem and avoid overfitting to the data.

In [None]:
# Discard the body-length feature from both frames (rebinding each name to the new frame).
real_df = real_df.drop(columns='length_text')
fake_df = fake_df.drop(columns='length_text')

## Number of capital letters in title

I hypothesise that there is a relationship between the number or proportion of capital letters in an article's title and the article's verisimilitude.

In [None]:
# NOTE(review): this definition shadows the `get_number_capitals` imported from
# `bespoke_functions` in the imports cell — confirm which one is meant to be used.
def get_number_capitals(input_array, num_or_prop):
    """Count the capital letters in each string of a collection.

    Parameters
    ----------
    input_array : iterable of str
        Strings to analyse, e.g. a pandas Series or a plain list.
    num_or_prop : {'num', 'prop'}
        'num' returns the number of uppercase characters in each string;
        'prop' returns that number divided by the string's length.

    Returns
    -------
    list
        One value per input string, in input order: an int for 'num',
        a float for 'prop'. An empty string has proportion 0.0.

    Raises
    ------
    ValueError
        If `num_or_prop` is neither 'num' nor 'prop'.
    """
    # Fail loudly on a bad mode instead of silently returning an empty list.
    if num_or_prop not in ('num', 'prop'):
        raise ValueError("num_or_prop must be 'num' or 'prop', got " + repr(num_or_prop))

    output_array = []

    # Iterate the values directly so lists work as well as Series.
    for text in input_array:
        number_capitals = sum(1 for char in text if char.isupper())

        if num_or_prop == 'num':
            output_array.append(number_capitals)
        else:
            # Guard against dividing by zero for empty strings.
            output_array.append(number_capitals / len(text) if text else 0.0)

    return output_array

In [None]:
# Count of capital letters in each title, added as a column on both frames.
for frame in (real_df, fake_df):
    frame['number_capitals_title'] = get_number_capitals(frame['title'], num_or_prop='num')

In [None]:
# Plot `number_capitals_title` for both datasets with the project's histogram helper.
plot_comparison_histogram(
    real_df,
    fake_df,
    'number_capitals_title',
    title='Histogram of number of capital letters in the title of a news article',
    xlabel='Number of capitals',
    xlim=80,
)

#### Independent sample t-test

In [None]:
# Two-sample t-test on the capital-letter count (real vs fake); report statistic and p-value.
t, p = stats.ttest_ind(
    real_df['number_capitals_title'],
    fake_df['number_capitals_title'],
)
print("t = ", t, sep="")
print("p = ", p, sep="")

The p value approximates to zero, suggesting there is strong evidence that the means of these two samples are different, which supports the hypothesis.

In [None]:
# Share of each title's characters that are capitals, added as a column on both frames.
for frame in (real_df, fake_df):
    frame['proportion_capitals_title'] = get_number_capitals(frame['title'], num_or_prop='prop')

#### Histogram

In [None]:
# Plot `proportion_capitals_title` for both datasets with the project's histogram helper.
# The x-axis label now says "Proportion" to match the plotted column (it previously said "Number").
plot_comparison_histogram(
    real_df,
    fake_df,
    'proportion_capitals_title',
    title='Histogram of proportion of capital letters in the title of a news article',
    xlabel='Proportion of capitals',
    xlim=1,
)

In [None]:
# Two-sample t-test on the capital-letter proportion (real vs fake); report statistic and p-value.
t, p = stats.ttest_ind(
    real_df['proportion_capitals_title'],
    fake_df['proportion_capitals_title'],
)
print("t = ", t, sep="")
print("p = ", p, sep="")

The p value approximates to zero, suggesting there is strong evidence that the means of these two samples are different, which supports the hypothesis.

The proportion_capitals_title and number_capitals_title variables appear to have a high degree of collinearity. The t statistic for proportion_capitals_title is slightly lower, so I will remove number_capitals_title in order to reduce the dimensionality of the problem.

In [None]:
# Discard the raw capital-letter count from both frames (rebinding each name to the new frame).
real_df = real_df.drop(columns='number_capitals_title')
fake_df = fake_df.drop(columns='number_capitals_title')

## Average word length


I hypothesise that there is a relationship between the average word length in an article's title or body and the article's verisimilitude.

In [None]:
def get_avg_word_length(input_array):
    """Return the mean word length of each string in a collection.

    Words are the whitespace-separated tokens produced by `str.split()`.
    Only characters that belong to words are counted, so separating
    whitespace does not inflate the average.

    Parameters
    ----------
    input_array : iterable of str
        Strings to analyse, e.g. a pandas Series or a plain list.

    Returns
    -------
    list of float
        One average per input string, in input order. A string containing
        no words (empty or whitespace only) yields 0.0.
    """
    avg_word_length = []

    for text in input_array:
        words = text.split()  # split once and reuse for both length and count
        if words:
            avg_word_length.append(sum(len(word) for word in words) / len(words))
        else:
            # No words at all: avoid dividing by zero.
            avg_word_length.append(0.0)

    return avg_word_length

### Average word length in title

In [None]:
# Average word length of each title, added as a column on both frames.
for frame in (real_df, fake_df):
    frame['avg_word_length_title'] = get_avg_word_length(frame['title'])

In [None]:
# Plot `avg_word_length_title` for both datasets with the project's histogram helper.
# The title now reads "word length in the title"; the word "in" was missing.
plot_comparison_histogram(
    real_df,
    fake_df,
    'avg_word_length_title',
    title='Histogram of average word length in the title of a news article',
    xlabel='Average word length',
)

### Average word length in body of article

In [None]:
# Average word length of each article body, added as a column on both frames.
for frame in (real_df, fake_df):
    frame['avg_word_length_text'] = get_avg_word_length(frame['text'])

In [None]:
# Plot `avg_word_length_text` for both datasets with the project's histogram helper.
# The title now reads "word length in the body"; the word "in" was missing.
plot_comparison_histogram(
    real_df,
    fake_df,
    'avg_word_length_text',
    title='Histogram of average word length in the body of a news article',
    xlabel='Average word length',
    bins=500,
    xlim=10,
)

It does not appear that there is a relationship between the average word length of either the title or body of an article and its verisimilitude.

## Most common words

In [46]:
def get_most_common_words(input_array, n):
    """Return the `n` most frequent words across a collection of strings.

    Each string is split on whitespace and every word is upper-cased before
    counting, so the comparison is case-insensitive.

    Parameters
    ----------
    input_array : iterable of str
        Strings to analyse, e.g. a pandas Series or a plain list.
    n : int
        Maximum number of words to return.

    Returns
    -------
    list of str
        Upper-cased words ordered from most to least frequent.
    """
    # Count straight from a generator: wrapping the per-string word lists in
    # np.array fails on recent NumPy when the lists have different lengths.
    word_counts = Counter(
        word.upper() for text in input_array for word in text.split()
    )
    return [word for word, _count in word_counts.most_common(n)]

# Show the ten most frequent words in the real-news titles.
get_most_common_words(real_df['title'], n=10)

TypeError: 'Counter' object is not callable

In [44]:
# Show the hundred most frequent words in the real-news titles.
get_most_common_words(real_df['title'], n=100)

TypeError: 'Counter' object is not callable

In [None]:
# NOTE(review): `most_occur` is not assigned in any cell visible in this notebook, so this
# cell presumably fails with NameError on a fresh kernel — confirm whether it is leftover
# scratch work. It collects the first element of each item in `most_occur`.
[x[0] for x in most_occur]