<a href="https://colab.research.google.com/github/YujiaoZhao/Carla_Pull/blob/main/AllFeatures.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#importing in libraries
import pandas as pd
import numpy as np
import regex as re
import string
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from scipy import stats as st
import json
import re
from nltk.tokenize.regexp import WhitespaceTokenizer
from nltk import tokenize
from google.colab import drive

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Function for Count of Numbers

This function will return a list with the count of numerical values for each String in QText.

In [None]:
def number_count(texts):
    """ Input: Series of Strings
        Output: List holding, for every string, how many runs of
        digits it contains.
        Commas and periods are stripped before counting, so values
        written like "1,000" or "3.5" are counted once. """
    digit_run = re.compile("[0-9]+")
    return [
        len(digit_run.findall(text.replace(",", "").replace(".", "")))
        for text in texts
    ]

# Function for Count of Numbers and Number Words

Number words account for written-out numbers that include hundred, thousand, million, billion, trillion, or dozen.

In [None]:
def number_and_numwords_count(texts):
    """ Input: Series of strings
        Output: List with, for each string, the number of digit runs
        plus the number of number words (hundred, thousand, million,
        billion, trillion, dozen; singular or plural).
        Commas and periods are stripped first, so values written like
        "1,000" or "3.5" are counted as a single number. """
    # The alternation is grouped so the word boundaries apply to every
    # alternative; without the group, words such as "thousandth" or
    # "hundredth" would be counted as number words.
    num_word_pattern = re.compile(
        r"\b(?:hundreds?|thousands?|millions?|billions?|trillions?|dozens?)\b"
    )
    digit_pattern = re.compile("[0-9]+")

    number_counts = []
    for text in texts:
        text_clean = text.replace(",", "").replace(".", "").lower()
        num_words = len(num_word_pattern.findall(text_clean))
        nums = len(digit_pattern.findall(text_clean))
        number_counts.append(num_words + nums)

    return number_counts

# Has Numerical Value Function
This function takes in the "QText" column and returns a list with 1 for quotes that contain any type of numerical value (number or number word) and 0 otherwise.

In [None]:
def has_number(texts):
    """ Input: Series of strings (QText column)
        Output: List of 1s and 0s, one per string. A 1 marks a string
        holding at least one run of digits or one number word.
        Commas and periods are stripped before matching, so values
        written like "1,000" or "3.5" are handled. """
    word_pattern = re.compile(r"\bhundreds?\b|\bthousands?\b|\bmillions?\b|\bbillions?\b|\btrillions?\b|\bdozens?\b")
    digit_pattern = re.compile("[0-9]+")

    flags = []
    for raw in texts:
        cleaned = raw.replace(",", "").replace(".", "").lower()
        total = len(word_pattern.findall(cleaned)) + len(digit_pattern.findall(cleaned))
        flags.append(1 if total > 0 else 0)
    return flags

# Function for Count of Group Words
When I say "group words," I am referring to the words: us, our, ours, and we.

In [None]:
def group_wrd_count(texts):
    """ Input: Series of strings (QText column)
        Output: List with the number of group words
        (us, our, ours, we) found in each string.
        Matching is case-insensitive and punctuation is
        treated as whitespace.
        """
    punctuation = re.compile(r"[^\w\s]")
    group_words = re.compile(r"\bus\b|\bours?\b|\bwe\b")
    return [
        len(group_words.findall(punctuation.sub(" ", text.lower())))
        for text in texts
    ]


# Function for Frequency of Group Words

In [None]:
def group_wrd_freq(texts):
    """ Input: Series of strings (QText column)
        Output: List of frequency of group words in
        each string (# of "group words" / total # of words).
        Group words are us, our, ours and we (case-insensitive).
        Entries that are not strings (e.g. NaN for a missing quote)
        and strings without any words yield NaN, since the
        frequency is 0 / 0 for them.
        """
    punctuation = re.compile(r"[^\w\s]")
    group_words = re.compile(r"\bus\b|\bours?\b|\bwe\b")

    word_counts = []
    quote_lengths = []
    for text in texts:
        # Check the type before calling any string method; otherwise a
        # missing value (float NaN) raises AttributeError on .lower().
        if not isinstance(text, str):
            word_counts.append(0)
            quote_lengths.append(0)
            continue
        text_clean = punctuation.sub(" ", text.lower())
        word_counts.append(len(group_words.findall(text_clean)))
        quote_lengths.append(len(text.split()))

    # 0 / 0 is expected for empty or missing quotes; silence the warning
    # and let the result be NaN.
    with np.errstate(divide="ignore", invalid="ignore"):
        freqs = np.array(word_counts, dtype=float) / np.array(quote_lengths, dtype=float)
    return list(freqs)

# Function for Has Group Word
This function checks if there is at least one group word in each of a Series of Strings.

In [None]:
def has_group_word(texts):
    """ Input: Series of strings (QText column)
        Output: List of 1s and 0s, one per string. A 1 marks
        a string holding at least one group word
        (us, our, ours, we), matched case-insensitively.
        """
    punctuation = re.compile(r"[^\w\s]")
    group_words = re.compile(r"\bus\b|\bours?\b|\bwe\b")

    flags = []
    for text in texts:
        # Punctuation becomes whitespace before the search.
        cleaned = punctuation.sub(" ", text.lower())
        flags.append(int(group_words.search(cleaned) is not None))
    return flags

# Sentiment Feature Function

In [None]:
# Module-level VADER analyzer; sentence_vader and vader_15 both call
# vader.polarity_scores on it.
vader = SentimentIntensityAnalyzer()

def sentence_vader(string):
    """Return the mean VADER compound score over the sentences of `string`.

    The text is split into sentences with `tokenize.sent_tokenize`, every
    sentence is scored with the module-level `vader` analyzer, and the
    'compound' values are averaged with `np.mean`.
    """
    compound_scores = [
        vader.polarity_scores(sentence)['compound']
        for sentence in tokenize.sent_tokenize(string)
    ]
    return np.mean(compound_scores)

In [None]:
def vader_15(string):
    """Return the mean VADER compound score over 15-token chunks of `string`.

    The text is split on single spaces and grouped into consecutive chunks
    of 15 tokens (the final chunk holds whatever is left over). Each chunk
    is re-joined with spaces, scored with the module-level `vader` analyzer,
    and the 'compound' values are averaged with `np.mean`.
    """
    chunk_size = 15
    tokens = string.split(" ")

    # Stepping through the token list in strides of 15 yields the full
    # chunks followed by the shorter remainder, if there is one.
    chunks = [
        " ".join(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    ]

    compound_scores = [vader.polarity_scores(chunk)['compound'] for chunk in chunks]
    return np.mean(compound_scores)

# Subjectivity Feature Function

In [None]:
# Loading the MPQA Subjectivity Clues Lexicon (https://mpqa.cs.pitt.edu/lexicons/subj_lexicon/)
# The lexicon is read from a JSON file on the mounted Google Drive.
drive.mount('/content/drive')
file_path = 'drive/My Drive/mpqa_dict.json'
# Use a context manager so the file handle is closed once the JSON is parsed.
with open(file_path, "r") as mpqa_file:
    mpqa_dict = json.load(mpqa_file)

def get_subjectivity(art_str, mpqa_dict):
    """Score the subjectivity of a text against a subjectivity lexicon.

    Input:
        art_str: the text to score (string).
        mpqa_dict: mapping from a word to a dict with at least the keys
            'subj' ('weaksubj' or 'strongsubj') and 'pos' (part of
            speech; 'adj' marks adjectives).
    Output: tuple (subjectivity, adjective_subjectivity), both expressed
        per 100 words. A weakly subjective word contributes 0.1 and a
        strongly subjective word contributes 1; the second value only
        counts words whose 'pos' is 'adj'.
        A text without any words returns (0.0, 0.0).
    """
    # Every non-word character is a separator; lookup is case-sensitive.
    words = re.sub(r"[^\w]", " ", art_str).split()
    quote_len = len(words)
    if quote_len == 0:
        # Nothing to score; avoids dividing by zero below.
        return 0.0, 0.0

    weights = {'weaksubj': 0.1, 'strongsubj': 1}
    subjectivity, sub_adjectives = 0, 0

    for w in words:
        if w in mpqa_dict:
            entry = mpqa_dict[w]
            weight = weights.get(entry['subj'], 0)
            subjectivity += weight
            if entry['pos'] == 'adj':
                sub_adjectives += weight

    subjectivity_final = subjectivity / quote_len * 100
    sub_adjectives_final = sub_adjectives / quote_len * 100

    return subjectivity_final, sub_adjectives_final

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Has Nuke Function

In [None]:
# Indicates if the word 'nuke' was used in the text using regex
def has_nuke(text):
    """Return 1 if `text` contains "nuke" or "nukes", otherwise 0.

    The match is case-sensitive and must not be followed by another word
    character, so "nuked" or "nuker" do not count, while "nuke" at the
    very end of the text does. Anything that is not a string (e.g. NaN
    for a missing quote) returns 0.
    """
    if not isinstance(text, str):
        return 0
    # A negative lookahead instead of a required trailing non-word
    # character lets a match at the end of the text succeed.
    return int(re.search(r"nukes?(?!\w)", text) is not None)

### Count British Words Function

In [None]:
# Downloads a British-to-American spelling dictionary and builds the word
# lists used for counting British words in a text.
import requests
url ="https://raw.githubusercontent.com/hyperreality/American-British-English-Translator/master/data/british_spellings.json"
british_to_american_dict = requests.get(url).json()

# Full list of British spellings (the dictionary keys).
british_words = list(british_to_american_dict)

# Edited copy of the list with a few words taken out.
remove_words = ['buses', 'disc', 'dialogue', 'gramme']
british_words_edited = list(british_to_american_dict)
for _unwanted in remove_words:
    british_words_edited.remove(_unwanted)

def count_british_edited(text):
    """Count occurrences of the words in `british_words_edited` in `text`.

    An occurrence is the word immediately followed by a space or by one
    of the characters . ? , - : (plain substring matching, case-sensitive).
    Anything that is not a string returns 0.
    """
    if type(text) is not str:
        return 0

    trailing_chars = (" ", ".", "?", ",", "-", ":")
    return sum(
        text.count(word + trailer)
        for word in british_words_edited
        for trailer in trailing_chars
    )