In [63]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import torch
import html
import re
from transformers import pipeline
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

print('INITIALIZING DATA PROCESSING + CLEANING')

try:
    user_csv = input('Please input the exact name of the CSV file you wish to analyze: ')
    tweet_column = input('Please input the name of the column containing the tweets: ')
    tweet_column_with_quotes = "'" + tweet_column + "'"

    dataframe = pd.read_csv(user_csv, delimiter=',',encoding='utf-8', header = 0)
    pd.set_option('display.max_colwidth', None)
    dataframe.rename(columns={tweet_column:'tweet'}) #renaming the tweet column to 'tweet'

except FileNotFoundError:
    print('There was an error finding the CSV you requested, please check the following:','\n', '1. The CSV file is in the correct directory', '\n', '2. You gave the correct name of the file, following the syntax: yourfilename.csv')


df_copy = dataframe.copy() #creating a copy of the dataframe
df_copy['tweet'] = df_copy['tweet'].str.lower() #making everything lower case
df_copy.drop_duplicates(subset='tweet', keep='first', inplace=True, ignore_index=False) #removing duplicates
df_copy[~df_copy.tweet.str.startswith('rt')] #removing retweets
df_copy['tweet'] = df_copy['tweet'].apply(lambda k: html.unescape(str(k))) #removing unnecessary characters

def clean_text(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text) #Removed mentions
    text = re.sub(r'#', '', text) #Removed hashtags
    text = re.sub(r'https?:\/\/\S+', '', text) #Remove the hyperlink
    text = re.sub(r'\'[\s]+', '', text) #Remove apostrophe
    text = re.sub(r'\...+', '', text) #Remove dots
    text = re.sub(r'\!', '', text) #Remove exclamation  marks

    return text

df_copy['tweet'] = df_copy['tweet'].apply(clean_text)

df_copy.to_csv('Cleaned_Data.csv')

print('\n','DATA PROCESSING + CLEANING COMPLETE', '\n')
print('INITIALIZING DATA ANALYSIS')

print('The model being used is Valhalla, an optimized version of the Bart Large pretrained analysis model providing faster results.', '\n', 'If you have not used it before, it may take a while to download as it is quite large.')
tokenizer = AutoTokenizer.from_pretrained("valhalla/distilbart-mnli-12-1")
model = AutoModelForSequenceClassification.from_pretrained("valhalla/distilbart-mnli-12-1", device = -1)

df_original  = pd.read_csv(r'Cleaned_Data.csv')
rows = df_original['tweet'].count()

try:
    number_of_rows = int(input(f'Please specify the number of rows you wish to analyze, in your current dataset, there are {rows} rows of tweets, if you wish to look at them all, input any non-numerical character'))
    df_name = df_original.head(number_of_rows)
except ValueError:
    df_name = df_original

labels = input('Please specify the lables you wish to analyze the tweets with, type "done" once you have inputted the ones you wish to use')

candidate_labels = []
candidate_results = []

while labels != 'done':
    candidate_labels.append(labels)
    labels = input('Please specify the lables you wish to analyze the tweets with, type "done" once you have inputted the ones you wish to use')

print(f'{candidate_labels}')
approval = input('Are these labels correct? if so, type "y", if not, type "n"')

if approval == 'y':
    print(f'These are the labels which will be used: {candidate_labels}')
elif approval == 'n':
    print(f'The labels entered were incorrect or incomplete, this process will now start over')
    candidate_labels = []
    candidate_results = []
    labels = input('Please specify the lables you wish to analyze the tweets with, type "done" once you have inputted the ones you wish to use')
    while labels != 'done':
        candidate_labels.append(labels)
        labels = input('Please specify the lables you wish to analyze the tweets with, type "done" once you have inputted the ones you wish to use')
        print(f'{candidate_labels}')
        approval = input('Are these labels correct? if so, type "y", if not, type "n"')
else:
    print('You have inputted something other than "y" or "n", please input one of these')
    print(f'{candidate_labels}')
    approval = input('Are these labels correct? if so, type "y", if not, type "n"')

for x in range(len(candidate_labels)):
    candidate_results.append(0)

print(candidate_results)


""" candidate_labels = ['racist', 'sexist', 'hatespeech', 'neutral', 'offensive']
candidate_results = [0, 0, 0, 0, 0]

for sent in tqdm(df_name['tweet'].values):
        
    res = classifier(sent, candidate_labels, multi_class = False) #change multiclass to True for different results

    if res['labels'][0] == 'racist' and res['scores'][0] > 0.5:
        candidate_results[0] = candidate_results[0] + 1
    if res['labels'][0] == 'sexist' and res['scores'][0] > 0.5:
        candidate_results[1] = candidate_results[1] + 1
    if res['labels'][0] == 'hatespeech' and res['scores'][0] > 0.5:
        candidate_results[2] = candidate_results[2] + 1
    if res['labels'][0] == 'neutral' and res['scores'][0] > 0.5:
        candidate_results[3] = candidate_results[3] + 1
    if res['labels'][0] == 'offensive' and res['scores'][0] > 0.5:
        candidate_results[4] = candidate_results[4] + 1

    if res['scores'][0] > 0.5:
        print(sent)
        print(res['labels'])
        print(res['scores'])
        print('\n')

print(candidate_results) """

INITIALIZING DATA PROCESSING + CLEANING

 DATA PROCESSING + CLEANING COMPLETE 

INITIALIZING DATA ANALYSIS
The model being used is Valhalla, an optimized version of the Bart Large pretrained analysis model providing faster results. 
 If you have not used it before, it may take a while to download as it is quite large.
['racist', 'sexist', 'hatespeech']
The labels entered were incorrect or incomplete, this process will now start over


" candidate_labels = ['racist', 'sexist', 'hatespeech', 'neutral', 'offensive']\ncandidate_results = [0, 0, 0, 0, 0]\n\nfor sent in tqdm(df_name['tweet'].values):\n        \n    res = classifier(sent, candidate_labels, multi_class = False) #change multiclass to True for different results\n\n    if res['labels'][0] == 'racist' and res['scores'][0] > 0.5:\n        candidate_results[0] = candidate_results[0] + 1\n    if res['labels'][0] == 'sexist' and res['scores'][0] > 0.5:\n        candidate_results[1] = candidate_results[1] + 1\n    if res['labels'][0] == 'hatespeech' and res['scores'][0] > 0.5:\n        candidate_results[2] = candidate_results[2] + 1\n    if res['labels'][0] == 'neutral' and res['scores'][0] > 0.5:\n        candidate_results[3] = candidate_results[3] + 1\n    if res['labels'][0] == 'offensive' and res['scores'][0] > 0.5:\n        candidate_results[4] = candidate_results[4] + 1\n\n    if res['scores'][0] > 0.5:\n        print(sent)\n        print(res['labels'])\n     

In [None]:
24740