## Import Libraries for Text Processing
Load the libraries needed for natural language processing: NLTK for tokenization, stop-word removal, and lemmatization, and TextBlob for optional spelling correction.

In [108]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Import TextBlob's Word class, installing the textblob package first if the import fails.
# NOTE(review): `!pip install` is IPython shell syntax, so this cell only runs inside a notebook/IPython session.
try:
    from textblob import Word
except ImportError:
    !pip install textblob
    from textblob import Word

# NLTK data needed by this notebook, as {lookup path: download id}.
# The Punkt tokenizer models back word_tokenize(), which the stop-word and
# tokenization cells call; without them word_tokenize raises LookupError.
# NOTE(review): newer NLTK releases load 'punkt_tab' rather than 'punkt'; both are
# requested here so either version finds what it needs — confirm against the installed NLTK.
_required_nltk_resources = {
    'corpora/stopwords': 'stopwords',
    'corpora/wordnet': 'wordnet',
    'tokenizers/punkt': 'punkt',
    'tokenizers/punkt_tab': 'punkt_tab',
}
for _resource_path, _package_id in _required_nltk_resources.items():
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        # Only download when the resource is not already available locally.
        nltk.download(_package_id)

## Load Dataframes
Load multiple CSV files containing raw tweet data.

In [109]:
# Raw tweet exports to load, one dataframe per file.
raw_csv_paths = (
    'twitter_raw_new.csv',
    'twitter_raw_new_2.csv',
    'twitter_raw_new_3.csv',
    'twitter_raw_new_4.csv',
    'twitter_raw_new_before_1.csv',
)
# Files are read in the order listed and bound to the names used by later cells.
df, df2, df3, df4, df_b1 = map(pd.read_csv, raw_csv_paths)

## Concatenate Dataframes
Combine multiple dataframes into a single dataframe for unified processing.

In [110]:
# Stack all loaded dataframes into one, renumbering the index from 0.
frames = [df, df2, df3, df4, df_b1]
df = pd.concat(frames, ignore_index=True)

## Select Relevant Columns
Focus on specific columns relevant to the sentiment analysis.

In [111]:
# Keep only these columns for the rest of the notebook.
selected_columns = ['username', 'text', 'date', 'comments', 'retweets', 'quotes', 'likes']
df = df[selected_columns]

## Clean and Parse Dates
Normalize date format to ensure consistency and facilitate time-based analysis.

In [112]:
# The raw date strings contain a '·' character before the time; remove it so the
# format string below can parse them.
# regex=False forces a literal match, so the result does not depend on the pandas
# version's default for `regex` (and avoids the single-character-pattern warning on pandas 1.x).
df['date'] = df['date'].str.replace('·', '', regex=False)

# Convert the cleaned strings to timezone-aware UTC timestamps.
# NOTE(review): the format implies values like 'Mar 5, 2023 10:30 AM UTC' — confirm against the CSVs.
df['date'] = pd.to_datetime(df['date'], utc=True, format='%b %d, %Y %I:%M %p %Z')

## Extract Hashtags and Mentions
Isolate hashtags and mentions from the tweet text for separate analysis.

In [113]:
# A hashtag is '#' plus everything up to the next whitespace or the end of the string.
hashtag_regex = r'#.*?(?=\s|$)'

# Store each tweet's hashtags as a list in a new column.
df['hastag'] = df['text'].str.findall(hashtag_regex)


def _strip_hashtags(tweet):
    """Return the tweet text with every hashtag removed."""
    return re.sub(hashtag_regex, '', tweet)


# Replace the text column with the hashtag-free text.
df['text'] = df['text'].apply(_strip_hashtags)

In [114]:
# A mention is '@' followed by one or more word characters.
mention_regex = r'@\w+'

# Store each tweet's mentions as a list in a new column.
df['mentions'] = df['text'].apply(lambda tweet: re.findall(mention_regex, tweet))

# Replace the text column with the mention-free text.
df['text'] = df['text'].apply(lambda tweet: re.sub(mention_regex, '', tweet))

## Deduplicate Text
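Remove tweets whose text (after hashtags and mentions have been stripped) exactly duplicates an earlier tweet, keeping the first occurrence, so that every remaining text is unique.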

In [115]:
# Drop rows whose text is a 100% match of an earlier row; the first occurrence is kept.
dedupe_columns = ['text']
df = df.drop_duplicates(subset=dedupe_columns, keep='first')

## Load and Apply Text Normalization Resources
Read the contraction and slang lookup tables and build dictionaries from them; these are applied to the tweet text in the cleaning steps below.

In [116]:
# Lookup tables used to expand contractions and slang abbreviations.
contractions_df = pd.read_csv('contractions.csv')
slangs_df = pd.read_csv('slangs.csv', index_col=0)

# Pair each source term with its replacement; for a repeated term the last row wins.
contraction_keys, contraction_values = contractions_df.key, contractions_df.value
contractions_dict = dict(zip(contraction_keys, contraction_values))

slang_abbreviations, slang_fullforms = slangs_df.Abbr, slangs_df.Fullform
slangs_dict = dict(zip(slang_abbreviations, slang_fullforms))

## Clean and Normalize Text
Perform a series of text-cleaning steps, including emoji removal, URL stripping, digit and punctuation removal, and lowercasing.

In [117]:
# Trim leading and trailing whitespace from every tweet.
stripped_text = df['text'].str.strip()
df['text'] = stripped_text

# Built once at definition time instead of on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # miscellaneous symbols and pictographs
    "\U0001F680-\U0001F6FF"  # transport and map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # geometric shapes extended
    "\U0001F800-\U0001F8FF"  # supplemental arrows-C
    "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    "\U0001FA00-\U0001FA6F"  # chess symbols
    "\U0001FA70-\U0001FAFF"  # symbols and pictographs extended-A
    "\U0001FB00-\U0001FBFF"  # symbols for legacy computing
    "\U0001FC00-\U0001FCFF"
    "\U0001F004-\U0001F0CF"  # mahjong tile through playing cards
    "\U0001F18E-\U0001F251"  # enclosed characters, including regional-indicator flags
    "\U00002600-\U000027BF"  # miscellaneous symbols and dingbats (e.g. U+2764 heart, U+2728 sparkles)
    "\U0000FE0F"             # variation selector-16, which trails many emoji
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return *text* with emoji and pictographic symbols deleted.

    Each run of matching characters is replaced by the empty string (not a
    space), so words separated only by an emoji end up joined together.

    Args:
        text: The string to clean.

    Returns:
        The input string without any character covered by the emoji ranges.
    """
    return _EMOJI_PATTERN.sub('', text)


In [None]:

def _scrub_tweet(raw):
    """Apply the per-tweet cleaning steps in order and return the cleaned string."""
    without_emojis = remove_emojis(raw)
    # Drop URLs: 'http' followed by any run of non-whitespace characters.
    without_urls = re.sub(r'http\S+', '', without_emojis)
    # Drop every run of digits.
    without_digits = re.sub(r'\d+', '', without_urls)
    # Replace anything that is neither a word character nor whitespace with a space.
    without_punctuation = re.sub(r'[^\w\s]', ' ', without_digits)
    return without_punctuation.lower()


df['text'] = df['text'].apply(_scrub_tweet)


In [None]:
# Requires the NLTK 'stopwords' corpus (nltk.download('stopwords')).
stop_words = set(stopwords.words('english'))


def _drop_stop_words(text):
    """Tokenize *text* and re-join the tokens that are not English stop words."""
    kept_tokens = [token for token in word_tokenize(text) if token not in stop_words]
    return ' '.join(kept_tokens)


df['text'] = df['text'].apply(_drop_stop_words)

def normalize_text(text):
    """Expand contractions and slang in *text*, word by word.

    Each whitespace-separated word is looked up in ``contractions_dict``; the
    result (or the word itself when absent) is then looked up in
    ``slangs_dict``. The replacements are re-joined with single spaces.
    """
    replaced = []
    for word in text.split():
        expanded = contractions_dict.get(word, word)
        replaced.append(slangs_dict.get(expanded, expanded))
    return ' '.join(replaced)

# Expand contractions and slang in every tweet.
df['text'] = df['text'].apply(normalize_text)


def _keep_long_alphabetic_words(text):
    """Keep only words longer than two characters that consist solely of letters."""
    return ' '.join(
        word for word in text.split()
        if len(word) > 2 and word.isalpha()
    )


df['text'] = df['text'].apply(_keep_long_alphabetic_words)

# Optional TextBlob spelling correction, currently disabled:
# df['text'] = df['text'].apply(lambda text: ' '.join([Word(word).correct() for word in text.split()]))

## Tokenize and Lemmatize Text
Tokenize the tweet text and lemmatize each token to reduce words to their base form.

In [None]:
# Requires the NLTK 'wordnet' corpus (nltk.download('wordnet')).
lemmatizer = WordNetLemmatizer()


def _tokenize_and_lemmatize(text):
    """Split *text* into tokens and return the list of lemmatized tokens."""
    return [lemmatizer.lemmatize(token) for token in word_tokenize(text)]


# The text column now holds a list of lemmas per tweet instead of a string.
df['text'] = df['text'].apply(_tokenize_and_lemmatize)

## Create Unigrams and Bigrams
Generate unigrams and bigrams from the text to capture single words and two-word combinations.

In [118]:
# Name of the dataframe column that the n-gram step reads from.
column_to_use = 'text'

def create_unigrams_and_bigrams(text_list):
    """Return the tokens of *text_list* followed by its bigrams.

    Unigrams are the tokens themselves; each bigram joins two adjacent tokens
    with an underscore. A list with fewer than two tokens yields no bigrams.
    """
    joined_pairs = []
    for left, right in zip(text_list, text_list[1:]):
        joined_pairs.append('_'.join((left, right)))
    return text_list + joined_pairs

# Build the unigram + bigram list for every row and store it in a new column.
ngram_source = df[column_to_use]
df['text_ngrams'] = ngram_source.apply(create_unigrams_and_bigrams)

## Remove Empty Text Entries
Discard any rows where the text has been reduced to an empty list after processing.

In [119]:
# Keep only the rows whose n-gram list contains at least one entry.
has_ngrams = df['text_ngrams'].map(len) > 0
df = df[has_ngrams]

## Save the Cleaned Data
The cleaned and processed data can be saved to a CSV file for further analysis; the export line below is currently commented out, so uncomment it to write the file.

In [120]:
# df.to_csv('twitter_cleaned_6.csv', index=False)