In [None]:
import pandas as pd

In [None]:
# Load the raw tweet dataset from the CSV in the working directory.
df = pd.read_csv('cyberbullying_tweets.csv')
df
# Switch to the shorter column names ('text', 'sentiment') used by the rest
# of the script.
df = df.rename(columns=dict(tweet_text='text', cyberbullying_type='sentiment'))
df.head()
# Clean emojis from text
def strip_emoji(text):
    """Return *text* with every emoji removed.

    ``emoji.get_emoji_regexp()`` was deprecated in emoji 1.7 and removed in
    emoji 2.0, so the supported ``emoji.replace_emoji`` helper is preferred.
    The legacy regexp path is kept only as a fallback for older installs
    that do not provide ``replace_emoji``.
    """
    if hasattr(emoji, "replace_emoji"):
        return emoji.replace_emoji(text, replace="")
    # Legacy emoji (< 1.7) releases: only the compiled-regexp API exists.
    return emoji.get_emoji_regexp().sub("", text)

# Remove punctuations, stopwords, links, mentions and new line characters
def strip_all_entities(text):
    """Lowercase *text* and strip line breaks, links, mentions, non-ASCII
    characters, punctuation and stop words.

    The result is the surviving words joined by single spaces. Stop words
    are looked up in the module-level ``stop_words`` collection.
    """
    lowered = text.lower()
    # Line breaks become spaces so words on adjacent lines stay separated.
    flattened = re.sub(r'\r|\n', ' ', lowered)
    # Anything starting with '@' or 'http(s)://' is removed up to the next
    # whitespace character.
    without_links = re.sub(r"(?:\@|https?\://)\S+", "", flattened)
    ascii_only = re.sub(r'[^\x00-\x7f]', '', without_links)
    # Delete every character listed in string.punctuation in a single pass.
    punctuation_free = ascii_only.translate(
        str.maketrans('', '', string.punctuation)
    )
    kept_words = [
        token for token in punctuation_free.split() if token not in stop_words
    ]
    return ' '.join(kept_words)

# Clean hashtags at the end of the sentence, and keep those in the middle of the sentence by removing just the # symbol
def clean_hashtags(tweet):
    """Drop trailing hashtags and un-hash the remaining ones.

    A run of whitespace-separated hashtags at the very end of *tweet* is
    deleted entirely; any other hashtag keeps its word but loses the leading
    ``#``. The result is stripped of surrounding whitespace.
    """
    # Step 1: cut the block of hashtags that closes the tweet, if any.
    without_trailing = re.sub(r'(\s+#[\w-]+)+\s*$', '', tweet)
    without_trailing = without_trailing.strip()

    # Step 2: for hashtags left inside the text, keep only the tag word.
    unhashed = re.sub(r'#([\w-]+)', r'\1', without_trailing)
    return unhashed.strip()

# Filter special characters such as & and $ present in some words
def filter_chars(text):
    """Blank out every whitespace-separated word containing ``$`` or ``&``.

    Each removed word is replaced by an empty string before the words are
    re-joined with single spaces, so a removed word leaves an extra space
    behind in the result.
    """
    pieces = []
    for token in text.split():
        has_special = '$' in token or '&' in token
        pieces.append('' if has_special else token)
    return ' '.join(pieces)

# Remove multiple spaces
def remove_mult_spaces(text):
    """Collapse each run of two or more whitespace characters into one space.

    A lone whitespace character (including a leading or trailing one) is
    left untouched.
    """
    whitespace_run = re.compile(r"\s\s+")
    return whitespace_run.sub(" ", text)

# Function to check if the text is in English, and return an empty string if it's not
def filter_non_english(text):
    """Return *text* unchanged when it is detected as English, else ``""``.

    Language detection is delegated to ``detect``; when it raises
    ``LangDetectException`` the text is treated as non-English.
    """
    try:
        detected = detect(text)
    except LangDetectException:
        # Detection failed, so the language is unknown: discard the text.
        return ""
    if detected != "en":
        return ""
    return text

# Expand contractions
def expand_contractions(text):
    """Return the result of running *text* through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

# Remove numbers
def remove_numbers(text):
    """Delete every run of digits from *text*.

    Surrounding characters, including whitespace, are left as they are.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

# Lemmatize words
def lemmatize(text):
    """Tokenize *text*, lemmatize each token and re-join with single spaces.

    Tokenization uses ``word_tokenize`` and each token is passed to the
    module-level ``lemmatizer.lemmatize``.
    """
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

# Remove short words
def remove_short_words(text, min_len=2):
    """Keep only the words of *text* that have at least *min_len* characters.

    Words are split on whitespace and the survivors are re-joined with
    single spaces.
    """
    survivors = filter(lambda token: len(token) >= min_len, text.split())
    return ' '.join(survivors)

# Replace elongated words with their base form
def replace_elongated_words(text):
    """Collapse character runs of three or more repeats down to one.

    Every run of the same word character repeated at least three times
    (e.g. the ``ooooo`` in ``sooooo``) is replaced by a single occurrence,
    so ``"sooooo"`` becomes ``"so"`` and ``"yeeesss"`` becomes ``"yes"``.
    Doubled letters such as the ``oo`` in ``"cool"`` are left alone.

    The earlier pattern began with a greedy ``(\\w+)`` prefix, which
    swallowed part of the repeated run itself: only two characters were
    removed from long runs (``"sooooo"`` -> ``"sooo"``), and only one run
    per word was ever handled.
    """
    return re.sub(r'(\w)\1{2,}', r'\1', text)

# Remove repeated punctuation
def remove_repeated_punctuation(text):
    """Reduce each run of ``?``, ``.`` and ``!`` to its final character.

    The pattern matches the punctuation characters that are still followed
    by another one of the three, so only the last mark of a run survives.
    """
    run_before_last = re.compile(r'[\?\.\!]+(?=[\?\.\!])')
    return run_before_last.sub('', text)

# Remove extra whitespace
def remove_extra_whitespace(text):
    """Normalize whitespace: single spaces between words, none at the ends."""
    tokens = text.split()
    return ' '.join(tokens)

def remove_url_shorteners(text):
    """Remove links that point at well-known URL-shortener hosts.

    A link is an optional ``http://``/``https://`` scheme, an optional
    ``www.`` prefix, one of the listed shortener hosts, and the following
    run of non-whitespace characters. The whole link is deleted; the
    surrounding whitespace is left in place.

    The leading ``\\b`` anchors the match to the start of a word. Without
    it, short hosts such as ``t.co`` also matched inside unrelated text,
    e.g. ``"internet.com/page"`` was truncated to ``"interne"``.
    """
    shortener_hosts = (
        r'bit\.ly|goo\.gl|t\.co|tinyurl\.com|tr\.im|is\.gd|cli\.gs|u\.nu|'
        r'url\.ie|tiny\.cc|alturl\.com|ow\.ly|bit\.do|adoro\.to'
    )
    pattern = r'\b(?:http[s]?://)?(?:www\.)?(?:' + shortener_hosts + r')\S+'
    return re.sub(pattern, '', text)

# Remove spaces at the beginning and end of the tweet
def remove_spaces_tweets(tweet):
    """Return *tweet* without leading or trailing whitespace."""
    trimmed = tweet.strip()
    return trimmed

# Remove short tweets
def remove_short_tweets(tweet, min_words=3):
    """Return *tweet* if it has at least *min_words* words, otherwise ``""``.

    Words are counted by splitting on whitespace; the tweet itself is
    returned unmodified when it is long enough.
    """
    if len(tweet.split()) < min_words:
        return ""
    return tweet

# Function to call all the cleaning functions in the correct order
def clean_tweet(tweet):
    """Run the full cleaning pipeline on a single raw tweet.

    The steps are applied in sequence, each receiving the previous step's
    output, and the final text has its whitespace normalized to single
    spaces.

    Ordering matters: ``strip_all_entities`` deletes every punctuation
    character, so the steps that key on punctuation (shortener links need
    ``.``/``/``, hashtags need ``#``, ``filter_chars`` needs ``$``/``&``,
    repeated punctuation needs ``?``/``.``/``!``) are run *before* it.
    Previously they ran afterwards and could never match anything.
    """
    pipeline = (
        strip_emoji,
        expand_contractions,
        filter_non_english,
        # Punctuation-dependent steps: must precede strip_all_entities.
        remove_url_shorteners,
        clean_hashtags,
        filter_chars,
        remove_repeated_punctuation,
        # Lowercases and removes links, mentions, punctuation, stop words.
        strip_all_entities,
        remove_mult_spaces,
        remove_numbers,
        lemmatize,
        remove_short_words,
        replace_elongated_words,
        remove_extra_whitespace,
        remove_spaces_tweets,
        remove_short_tweets,
    )
    for step in pipeline:
        tweet = step(tweet)
    return ' '.join(tweet.split())  # Remove multiple spaces between words
# Run the cleaning pipeline on every raw tweet.
df['text_clean'] = [clean_tweet(tweet) for tweet in df['text']]
df.head()
print(f'There are around {int(df["text_clean"].duplicated().sum())} duplicated tweets, we will remove them.')
# Keep only the first row for each distinct cleaned text.
df.drop_duplicates("text_clean", inplace=True)
df.sentiment.value_counts()
# Discard the "other_cyberbullying" class. The explicit .copy() makes the
# filtered frame independent of the original, so the column assignments below
# do not write into a slice (avoids pandas' SettingWithCopyWarning).
df = df[df["sentiment"]!="other_cyberbullying"].copy()
# Number of whitespace-separated words in each cleaned tweet.
df['text_len'] = [len(text.split()) for text in df.text_clean]
df.sentiment.value_counts()
# Encode the five remaining class names as integer labels.
df['sentiment'] = df['sentiment'].replace({'religion':0,'age':1,'ethnicity':2,'gender':3,'not_cyberbullying':4})
# Split by class label and order each class from longest to shortest cleaned
# tweet, so that slicing with .iloc[n:] below discards the n longest tweets.
df_label_0 = df[df['sentiment'] == 0].sort_values(by='text_len', ascending=False)
df_label_1 = df[df['sentiment'] == 1].sort_values(by='text_len', ascending=False)
df_label_2 = df[df['sentiment'] == 2].sort_values(by='text_len', ascending=False)
df_label_3 = df[df['sentiment'] == 3].sort_values(by='text_len', ascending=False)
df_label_4 = df[df['sentiment'] == 4].sort_values(by='text_len', ascending=False)

# Number of rows to drop from each class.
# NOTE(review): these are hard-coded; the original comment described them as
# "half" of each class - confirm against the actual class sizes.
n_to_drop_0 = 7164
n_to_drop_1 = 7068
n_to_drop_2 = 6679
n_to_drop_3 = 6643
n_to_drop_4 = 3077

# Drop rows from class 0
df_label_0_dropped = df_label_0.iloc[n_to_drop_0:].drop(columns=['text_len'])

# Drop rows from class 1
df_label_1_dropped = df_label_1.iloc[n_to_drop_1:].drop(columns=['text_len'])

# Drop rows from classes 2-4. Each class is sliced from its own frame; these
# three previously sliced df_label_0 by mistake, which duplicated class 0 and
# lost classes 2-4 entirely.
df_label_2_dropped = df_label_2.iloc[n_to_drop_2:].drop(columns=['text_len'])

df_label_3_dropped = df_label_3.iloc[n_to_drop_3:].drop(columns=['text_len'])

df_label_4_dropped = df_label_4.iloc[n_to_drop_4:].drop(columns=['text_len'])

# Concatenate the five per-class DataFrames back into a single DataFrame
df = pd.concat([df_label_0_dropped, df_label_1_dropped, df_label_2_dropped, df_label_3_dropped, df_label_4_dropped]).reset_index(drop=True)
# NOTE(review): the keys here are class-name strings, but this script selects
# classes by the integer labels 0-4 above, so this replace appears to match
# nothing - confirm the intended relabelling.
df['sentiment'] = df['sentiment'].replace({'religion':1,'age':0,'ethnicity':0,'gender':0,'not_cyberbullying':1})
# 'text_len' was dropped from the per-class frames above, so recompute it
# before it is used for the length filter (otherwise this raises KeyError).
df['text_len'] = [len(text.split()) for text in df.text_clean]
# Keep only tweets shorter than the 99.5th percentile of word count.
df = df[df['text_len'] < df['text_len'].quantile(0.995)]
# Longest remaining tweet, in words.
max_len = df['text_len'].max()
max_len