# Bespoke functions

## Import packages

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from statsmodels.stats import weightstats as stests
from collections import Counter

In [1]:
def plot_comparison_histogram(real_df, fake_df, variable_to_plot, title, xlabel, bins = None, xlim = None):
    """Overlay count histograms of one column from two datasets and show the figure.

    Parameters
    ----------
    real_df, fake_df
        Objects indexable by ``variable_to_plot``; they are drawn in that
        order and labelled 'Real news' and 'Fake news' in the legend.
    variable_to_plot
        Key used to pull the plotted values out of both inputs.
    title, xlabel
        Text for the plot title and the x-axis label.
    bins
        Forwarded unchanged to ``sns.distplot`` for both histograms.
    xlim
        Right-hand x-axis limit; the left limit is always left as ``None``.

    NOTE(review): ``sns.distplot`` is deprecated in recent seaborn releases --
    confirm the pinned seaborn version before upgrading.
    """
    _, ax = plt.subplots()

    # Real data first, then fake, so the artists line up with the legend labels.
    for frame in (real_df, fake_df):
        sns.distplot(frame[variable_to_plot], bins=bins, kde=False, ax=ax)

    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Count')
    ax.legend(['Real news', 'Fake news'])
    ax.set_xlim(left=None, right=xlim)
    plt.show()

In [5]:
def get_number_capitals(input_array, num_or_prop):
    """Count the uppercase characters in each string of ``input_array``.

    Parameters
    ----------
    input_array : iterable of str
        Strings to analyse. A plain list and a pandas Series both work; a
        Series is walked in positional order regardless of its index labels.
    num_or_prop : {'num', 'prop'}
        'num' returns the raw count of uppercase characters per string;
        'prop' returns that count divided by the length of the string.

    Returns
    -------
    list
        One int ('num') or float ('prop') per input string, in input order.
        An empty string has a proportion of 0.0.

    Raises
    ------
    ValueError
        If ``num_or_prop`` is neither 'num' nor 'prop'.
    """
    # Fail loudly on a mistyped mode instead of silently returning an empty list.
    if num_or_prop not in ('num', 'prop'):
        raise ValueError(
            "num_or_prop must be 'num' or 'prop', got {!r}".format(num_or_prop)
        )

    output_array = []

    # Iterating directly (rather than via .iloc) keeps lists usable too.
    for text in input_array:
        number_capitals = sum(1 for character in text if character.isupper())

        if num_or_prop == 'num':
            output_array.append(number_capitals)
        else:
            # An empty string has no characters; avoid dividing by zero.
            proportion_capitals = number_capitals / len(text) if text else 0.0
            output_array.append(proportion_capitals)

    return output_array

In [6]:
def get_most_common_words(input_array, n):
    """Return the ``n`` most frequent words across all strings, uppercased.

    Parameters
    ----------
    input_array : iterable of str
        Strings to tokenise; each one is split on whitespace.
    n : int or None
        Number of words to return, passed to ``Counter.most_common``.

    Returns
    -------
    list of str
        Uppercased words ordered from most to least frequent.
    """
    # Count straight from the token stream. Building a NumPy array from word
    # lists of differing lengths (ragged input) raises on recent NumPy
    # versions and is not needed for counting.
    word_counts = Counter(
        word.upper() for text in input_array for word in text.split()
    )
    return [word for word, _ in word_counts.most_common(n)]

In [7]:
def get_avg_word_length(input_array):
    """Return the average word length of each string in ``input_array``.

    Words are the whitespace-separated tokens of each string. Only the
    characters inside words are counted, so separating whitespace does not
    inflate the average.

    Parameters
    ----------
    input_array : iterable of str
        Strings to analyse.

    Returns
    -------
    list of float
        One average per input string, in input order. A string with no
        words (empty or whitespace-only) yields 0.0.
    """
    avg_word_length = []

    for text in input_array:
        words = text.split()
        characters_in_words = sum(len(word) for word in words)
        # Treat a wordless string as one word so the division is defined.
        avg_word_length.append(characters_in_words / max(len(words), 1))

    return avg_word_length