# PJE B - Data Analysis

## Data cleaning

For each tweet that we want to analyze, we need to clean the tweet to follow the rules and assist our analysis.

### Rules

- Remove mentions:
    - old: `@user how are you ?`
    - new: `how are you ?`

- Remove hashtags:
    - old: `This is so cool #music`
    - new: `This is so cool`

- Remove retweets:

- Remove link:
    - old: `I love this castle http://castle.com`
    - new: `I love this castle`

- Remove integrated link:
    - old: `Check this out - http://link.com`
    - new: `None`

- Remove happy and sad emoticons in the same tweet:
    - old: `I love this new music :) but there is no tourney soon :(`
    - new: `None`

- Add space before punctutation only if there's a letter before
    - old: `Hello!`
    - new: `Hello !`

In [7]:
from csv import reader
from os import getcwd
from os.path import join
from re import sub

def clean_data(data: str) -> str:
    patterns = [
        (r"@[a-zA-Z0-9]+", ""),                                                                         # Remove mentions
        (r"#[a-zA-Z0-9]+", ""),                                                                         # Remove hashtags
        (r"RT", ""),                                                                                    # Remove retweets
        (r".+ - http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", ""), # Remove attached links
        (r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", ""),      # Remove links
        (r".*[:;][\)][^\n]*[:;][\(].*|.*[:;][\(][^\n]*[:;][\)].*", ""),                                 # Remove happy and sad emoticons in the same tweet
        (r"(?<=[a-zA-Z])[!\?\"\.;,]", r" \g<0>"),                                                       # Add space before punctuation only if there's a letter before
        (r"[!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]", ""),                                                 # Remove punctuation
        # (r"[^\w\s]", ""),                                                                               # Remove all punctuation except word characters and whitespace
        (r"", "")                                                                                       # Remove any remaining whitespace
    ]

    for pattern in patterns:
        if data == "":
            break

        data = sub(pattern[0], pattern[1], data)

    return data.lower()

#### Cleaning Emojis
- emojibase Json file containing emoji Unicode and groupings (e.g., positive face, negative face).
- Removing tweets from the dataset that contain opposite emoji sentiments.

In [8]:
import json
import re

def check_emoji(emojis):
    with open(r"datasets\emojibase.json",'r') as f:
        js_data=json.load(f)
    faces = ['face positive','face neutral','face negative']
    dict_emoji={}
    for row in js_data:
        if row['category']=='smileys and people' and row['group'] in faces:
            dict_emoji[row['unicode'][0]]=row['group']
    if emojis[0]!=emojis[1]:
        return True
    return False


def clean_emojis(df):
    pattern = r'U\+[A-F0-9]{4}'
    for ind,row in df.iterrows():
        match = re.findall(pattern,row['text']) 
        if len(match)>1:
            if check_emoji(match):
                df = df[df['id']!=row['id']]
    return df

    

## Labelisation of the tweets

In [9]:
DEBUG: bool = False

def parse_keywords(data: str, positive_words: set, negative_words: set) -> tuple:
    positive: int = 0
    negative: int = 0

    for word in data.split(' '):
        if word in positive_words:
            positive += 1
        elif word in negative_words:
            negative += 1

    return (positive, negative)

if __name__ == "__main__":
    with open(join(getcwd(), "datasets/inputs/testdata.manual.2009.06.14.csv"), "r") as f, open(join(getcwd(), "datasets/output/cleaned_data.csv"), "w") as o:
        cleaned_data: set = set()

        for row in reader(f):
            data: str = clean_data(row[5])

            if data in cleaned_data:
                continue

            cleaned_data.add(data)

            if data != "":
                o.write(','.join(map(lambda x : f"\"{x}\"", row[:5])) + ',\"' + data.replace("\"", "\"\"") + "\"\n")

    with open(join(getcwd(), "datasets/output/cleaned_data.csv"), "r") as f, open(join(getcwd(), "datasets/inputs/positive.txt"), "r") as p, open(join(getcwd(), "datasets/inputs/negative.txt"), "r") as n:
        positive_words: set = set()
        negative_words: set = set()

        for line in p.readlines():
            for word in line.split(','):
                w = word.strip()

                if w != "":
                    positive_words.add(w)

        for line in n.readlines():
            for word in line.split(','):
                w = word.strip()

                if w != "":
                    negative_words.add(w)

        p, n, nt = 0, 0, 0
        k, l = 0, 0
        pn = 0 # Found positive but it's negative
        pnt = 0 # Found positive and it's neutral
        np = 0 # Found negative but it's positive
        nnt = 0 # Found negative and it's neutral
        ntp = 0 # Found neutral and it's positive
        ntn = 0 # Found neutral and it's negative

        for row in reader(f):
            positive, negative = parse_keywords(row[5], positive_words, negative_words)

            if positive > negative:
                if row[0] != '4':
                    if DEBUG:
                        print(f"Error in row {row[0]}: {row[5]} ({positive} positive, {negative} negative)")

                    if row[0] == '0':
                        pn += 1
                    elif row[0] == '2':
                        pnt += 1

                    k += 1
                else:
                    p += 1

            elif negative > positive:
                if row[0] != '0':
                    if DEBUG:
                        print(f"Error in row {row[0]}: {row[5]} ({positive} positive, {negative} negative)")

                    if row[0] == '4':
                        np += 1
                    elif row[0] == '2':
                        nnt += 1

                    k += 1
                else:
                    n += 1

            else:
                if row[0] != '2':
                    if DEBUG:
                        print(f"Error in row {row[0]}: {row[5]} ({positive} positive, {negative} negative)")

                    if row[0] == '4':
                        ntp += 1
                    elif row[0] == '0':
                        ntn += 1

                    k += 1
                else:
                    nt += 1

            l += 1

        # 53.91% accuracy
        # 58.02% accuracy
        accuracy = ((l - k) / l) * 100

        print(f"Errors: {k}/{l}")
        print(f"Positive: {p}")
        print(f"Positive but negative: {pn}")
        print(f"Positive but neutral: {pnt}")
        print(f"Wrong positive: {pn + pnt}\n")
        print(f"Negative: {n}")
        print(f"Negative but positive: {np}")
        print(f"Negative but neutral: {nnt}")
        print(f"Wrong negative: {np + nnt}\n")
        print(f"Neutral: {nt}")
        print(f"Neutral but positive: {ntp}")
        print(f"Neutral but negative: {ntn}")
        print(f"Wrong neutral: {ntp + ntn}\n")
        print(f"Accuracy: {accuracy:.2f}%")

Errors: 204/486
Positive: 111
Positive but negative: 26
Positive but neutral: 12
Wrong positive: 38

Negative: 55
Negative but positive: 1
Negative but neutral: 3
Wrong negative: 4

Neutral: 116
Neutral but positive: 68
Neutral but negative: 94
Wrong neutral: 162

Accuracy: 58.02%
