In [None]:
import os
import re
from pathlib import Path

import contractions
import emot
import matplotlib.pyplot as pp
import numpy as np
import pandas as pd
import seaborn as sb
from sklearn.feature_extraction.text import CountVectorizer

## ACL-14-Short: Brief Description
Exclusively tweets.

The raw data is a text file in which each record spans 3 consecutive lines, such that for $i \in [0, nlines/3)$:
* line i * 3 --> contains the context
* line i * 3 + 1 --> contains the sentiment target
* line i * 3 + 2 --> contains the sentiment polarity

Data cleaning plan:
* remove duplicate columns
* remove unnecessary references (http, @mentions, etc., and empty spaces using regex)
* clean up contractions (using the contractions library)
* clean up emojis by replacing them with a plain-English description (using emot)
* all text is lowercased
* polarities are converted to int.

Note that some of these steps have already been taken for ACL-14 and are therefore redundant. However, the functions developed here will form part of a standard ETL pipeline that is applied regardless of a particular data set's preprocessing history. This will ensure uniformity across multiple data sets.

At the end of this notebook train- and test dataframes will be stored in acl-14's "clean" directory as df_train and df_test. They will contain the following columns.
* context: left as is;
* preprocessed context: context preprocessed following the steps listed above;
* target: processed as above, where applicable;
* polarity: ternary label, stored as integer.

## Load Data

In [None]:
# Project root. Defaults to the original author's checkout so existing runs are
# unaffected; set DIRECTED_SENTIMENT_NLI_ROOT to run the notebook elsewhere
# without editing this cell.
proj_path = Path(
    os.environ.get(
        'DIRECTED_SENTIMENT_NLI_ROOT',
        '/home/ekarakoylu/PROJEX/directed_sentiment_nli',
    )
)
# Per-dataset directories, all located under <proj_path>/data.
acl_data_path = proj_path / 'data/acl-14-short'
sentfin_dp = proj_path / 'data/sentfin'
newsmtsc_dp = proj_path / 'data/newsmtsc'

In [None]:
def load_data_2_dframe(path: Path | str):
    """
    This functions parses data files in the ACL-14-Short format. 
        Input: 
            [pathlib.Path or str] data path
        Output:
            DataFrame with columns 'context' (str), 'target'(str), 'polarity'(int).

    """
    try:
        with open(path) as f:
            flines = f.read().splitlines()
        fdict = dict(
            context=flines[::3],
            target=flines[1::3],
            polarity=flines[2::3])
        df = pd.DataFrame(fdict)
        df['polarity'] = pd.to_numeric(df.polarity)
        return df
    except FileNotFoundError as err:
        # logging statement?
        raise err
    

In [None]:
# Parse the raw train and test files into DataFrames.
df_train = load_data_2_dframe(acl_data_path.joinpath('raw/train.raw'))
df_test = load_data_2_dframe(acl_data_path.joinpath('raw/test.raw'))

In [None]:
# Print pandas' summary of the training frame as a quick check on the load.
df_train.info()

In [None]:
# Print pandas' summary of the test frame as a quick check on the load.
df_test.info()

## Cleaning:

Steps involved are:
1. Remove unnecessary characters (including spurious blank spaces, references, etc.)
2. Clean contractions
3. Replace emojis with their textual equivalent.

In [None]:
def clean_emoticons(text):
    """
    Replace every emoji that `emot` reports in `text` with its 'mean' entry.

        Input:
            [str] text to process
        Output:
            [str] text in which each reported span text[start:end] has been
            swapped for the corresponding entry of the result's 'mean' list.
    """
    detector = emot.core.emot()
    detected = detector.emoji(text)
    spans = detected['location'][::-1]
    meanings = detected['mean'][::-1]
    # Work from the last match back to the first: a replacement changes the
    # length of the string, which would invalidate the offsets of any match
    # located after it.
    for span, meaning in zip(spans, meanings):
        text = ''.join((text[:span[0]], meaning, text[span[1]:]))
    return text
    

In [None]:
def remove_unnecessary(text):
    """
    Strip tweet noise from `text` and normalise it.

    Steps, in order: drop links, convert the HTML entities for '&', '<' and
    '>', drop @mentions and #hashtags, collapse every run of whitespace
    (newlines included) into a single space, trim both ends, lowercase.

        Input:
            [str] raw text
        Output:
            [str] cleaned, lowercased text
    """
    # Remove links. One pattern is enough: anything matched by the more
    # specific 'http://...' / 'https://...' forms is also matched here.
    text = re.sub(r'http\S+', '', text)
    # Convert HTML references. The trailing ';' is optional so that a
    # well-formed '&amp;' becomes 'and' rather than 'and;'.
    text = re.sub(r'&amp;?', 'and', text)
    text = re.sub(r'&lt;?', '<', text)
    text = re.sub(r'&gt;?', '>', text)
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)
    # Collapse whitespace runs (this also covers \r and \n) and trim the
    # blanks left behind where a leading/trailing mention or link was removed.
    text = re.sub(r'\s+', ' ', text).strip()
    # Convert to lowercase
    return text.lower()

In [None]:
def clean_contractions(text):
    """
    Expand contractions in `text` with `contractions.fix`.

    Best effort: if `contractions.fix` raises, the input is returned
    unchanged so that one bad record does not abort the whole pipeline.

        Input:
            [str] text to process
        Output:
            the value returned by contractions.fix, or `text` itself on failure
    """
    try:
        return contractions.fix(text)
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop a long-running apply().
        return text

### EDA
This section features EDA steps for the ACL-14 dataset. Specific steps are as follows:
1. Most common bigrams and trigrams to detect possible spurious/problematic patterns
2. General sentiment (inferred using Vader and TextBlob) and comparisons to target polarities.
3. Class distributions

In [None]:
def get_top_k_ngrams(corpus, k=None, ngram=2):
    """
    Count n-grams in `corpus`, plot the most frequent ones and return the counts.

        Input:
            corpus: documents to analyse, handed to CountVectorizer as-is
            k:      [int or None] how many of the most frequent n-grams to
                    plot and return; None keeps all of them
            ngram:  [int] n-gram size (2 -> bigrams, 3 -> trigrams)
        Output:
            tuple (top_k, d_wf) where
              top_k is a list of (n-gram, count) pairs sorted by descending
                    count and cut to k entries;
              d_wf  is a DataFrame with columns 'TweetText' and 'count'
                    holding every n-gram in the same order.
    """
    vec = CountVectorizer(ngram_range=(ngram, ngram), stop_words='english')
    # fit_transform walks the corpus once; separate fit + transform did it twice.
    bag_of_words = vec.fit_transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = sorted(
        ((word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )

    d_wf = pd.DataFrame(words_freq, columns=['TweetText', 'count'])

    # Plot only the requested top-k rows (iloc[:None] keeps everything). This
    # uses pandas' built-in plotting: `.iplot` exists only after importing
    # cufflinks, which this notebook does not do.
    top = d_wf.iloc[:k]
    ax = top.plot.bar(
        x='TweetText',
        y='count',
        legend=False,
        edgecolor='black',
        title=f'Top {len(top)} {ngram}-grams',
    )
    ax.set_xlabel(f'{ngram}-gram')
    ax.set_ylabel('Count')
    return words_freq[:k], d_wf

In [None]:
#d_bigram = get_top_k_ngrams(df.processed_context, k=20)
#d_trigram = get_top_k_ngrams(df.processed_context, k=20, ngram=3)