I will load the tweet, emoji, and slang datasets. In the tweet and emoji datasets, I will delete the columns that are not needed. In the slang datasets, I will convert the data to dictionaries. Each dictionary will then be saved to a JSON file to facilitate further data processing.

# 🎯 Step 0: Import Libraries
---

In [84]:
import ast
import json
import os
from itertools import islice

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


In [85]:
# Apply matplotlib's 'seaborn-v0_8' style sheet to figures created after this point.
plt.style.use('seaborn-v0_8')

In [86]:
# Colab-only: mount Google Drive at /content/drive, the root of every dataset
# path used in this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 🎯 Step 1: Load Dataset
---

## ✨ 1.1 Dataset Tweet

In [87]:
# Location of the raw tweet CSV on the mounted Google Drive.
path = '/content/drive/MyDrive/INDONESIA AI/Sentiment Analysis/Dataset/tweet.csv'

# Parse the CSV into a DataFrame, then preview its first five rows.
df_sentiment = pd.read_csv(filepath_or_buffer=path)

df_sentiment.head(5)

Unnamed: 0.1,Unnamed: 0,sentimen,tweet
0,0,negatif,Kata @prabowo Indonesia tidak dihargai bangsa ...
1,1,netral,"Batuan Langka, Tasbih Jokowi Hadiah dari Habib..."
2,2,netral,"Di era Jokowi, ekonomi Indonesia semakin baik...."
3,3,positif,"Bagi Sumatera Selatan, Asian Games berdampak p..."
4,4,negatif,Negara kita ngutang buat bngun infrastruktur y...


## ✨ 1.2 Dataset Emoji

In [88]:
# Location of the emoji lookup CSV on the mounted Google Drive.
path = '/content/drive/MyDrive/INDONESIA AI/Sentiment Analysis/Dataset/emoji.csv'

# Parse the CSV into a DataFrame, then preview its first five rows.
df_emoji = pd.read_csv(filepath_or_buffer=path)

df_emoji.head(5)

Unnamed: 0.1,Unnamed: 0,emoji-symbol,meaning,meaning-indo,emoji-unicode,emoji-name,emoji-name-indo
0,0,😄,smiley,tersenyum,ðŸ˜„,grinning face with smiling eyes,wajah menyeringai dengan mata tersenyum
1,1,😃,smiley,tersenyum,ðŸ˜ƒ,grinning face with big eyes,wajah menyeringai dengan mata besar
2,2,😀,smiley,tersenyum,ðŸ˜€,grinning face,wajah menyeringai
3,3,😊,smiley,tersenyum,ðŸ˜Š,smiling face with smiling eyes,wajah tersenyum dengan mata tersenyum
4,4,🙂,blush,memerah,â˜ºï¸,slightly smiling face,Wajah yang sedikit tersenyum


## ✨ 1.3 Dataset Slang

### 1.3.1 Slang Words 1

In [89]:
path = '/content/drive/MyDrive/INDONESIA AI/Sentiment Analysis/Dataset/Slang Words/slang_words1.txt'

# Read the whole file into one string. The context manager closes the file
# handle as soon as the text has been read; rebinding the name of an open file
# object to its contents would leave the handle open until garbage collection.
with open(path, "r") as slang_file:

    slang_words1 = slang_file.read()

slang_words1

'{\'@\': \'di\', \'abis\': \'habis\', \'ad\': \'ada\', \'adlh\': \'adalah\', \'afaik\': \'as far as i know\', \'ahaha\': \'haha\', \'aj\': \'saja\', \'ajep-ajep\': \'dunia gemerlap\', \'ak\': \'saya\', \'akika\': \'aku\', \'akkoh\': \'aku\', \'akuwh\': \'aku\', \'alay\': \'norak\', \'alow\': \'halo\', \'ambilin\': \'ambilkan\', \'ancur\': \'hancur\', \'anjrit\': \'anjing\', \'anter\': \'antar\', \'ap2\': \'apa-apa\', \'apasih\': \'apa sih\', \'apes\': \'sial\', \'aps\': \'apa\', \'aq\': \'saya\', \'aquwh\': \'aku\', \'asbun\': \'asal bunyi\', \'aseekk\': \'asyik\', \'asekk\': \'asyik\', \'asem\': \'asam\', \'aspal\': \'asli tetapi palsu\', \'astul\': \'asal tulis\', \'ato\': \'atau\', \'au ah\': \'tidak mau tahu\', \'awak\': \'saya\', \'ay\': \'sayang\', \'ayank\': \'sayang\', \'b4\': \'sebelum\', \'bakalan\': \'akan\', \'bandes\': \'bantuan desa\', \'bangedh\': \'banget\', \'banpol\': \'bantuan polisi\', \'banpur\': \'bantuan tempur\', \'basbang\': \'basi\', \'bcanda\': \'bercanda\', 

In [90]:
# The file content is a Python dict literal; parse it safely (no code execution).
slang_words1 = ast.literal_eval(slang_words1)

# Preview the first five slang -> standard pairs.
for key, value in islice(slang_words1.items(), 0, 5):
    print(f'{key} {value}')

@ di
abis habis
ad ada
adlh adalah
afaik as far as i know


### 1.3.2 Slang Words 2

In [91]:
# Location of the second slang list: a ';'-separated file without a header row.
path = '/content/drive/MyDrive/INDONESIA AI/Sentiment Analysis/Dataset/Slang Words/slang_words2.csv'

# header=None treats the first line as data; the two columns are named explicitly.
slang_words2 = pd.read_csv(
    path,
    sep=';',
    header=None,
    names=['slang_word', 'standard_word'],
)

slang_words2.head(5)

Unnamed: 0,slang_word,standard_word
0,aamiin,amin
1,adek,adik
2,adlh,adalah
3,aer,air
4,aiskrim,es krim


In [92]:
# Change the data format to dictionary
# slang_word column -> key
# standard_word column -> value

# Strip surrounding whitespace from the standard words first: the raw values
# carry a trailing space (they print as 'amin ', 'adik ', ...), which would
# otherwise be stored in the mapping and written to the saved JSON file.
slang_words2 = (
    slang_words2
    .assign(standard_word=lambda frame: frame['standard_word'].str.strip())
    .set_index('slang_word')['standard_word']
    .to_dict()
)

for key, value in islice(slang_words2.items(), 5):

    print(key, value)

aamiin amin 
adek adik 
adlh adalah 
aer air 
aiskrim es krim 


### 1.3.3 Slang Words 3

In [93]:
# Location of the third slang list on the mounted Google Drive.
path = '/content/drive/MyDrive/INDONESIA AI/Sentiment Analysis/Dataset/Slang Words/slang_words3.csv'

# With a tab separator every 'slang,formal' line lands in a single column;
# the pairs are split on the comma in the following cell.
_slang_words3 = pd.read_csv(filepath_or_buffer=path, sep='\t')

_slang_words3.head(5)

Unnamed: 0,"slang,formal"
0,"woww,wow"
1,"aminn,amin"
2,"met,selamat"
3,"netaas,menetas"
4,"keberpa,keberapa"


In [94]:
# Each row holds one 'slang,formal' string in its only column; split it on the
# comma into the key (slang word) and the value (standard word).
slang_words3 = {
    slang_word: standard_word
    for slang_word, standard_word in (
        row[0].split(',') for row in _slang_words3.values
    )
}

In [95]:
# Preview the first five slang -> standard pairs of the third dictionary.
for key, value in islice(slang_words3.items(), 0, 5):
    print(f'{key} {value}')

woww wow
aminn amin
met selamat
netaas menetas
keberpa keberapa


# 🎯 Step 2: Preprocessing Dataset

---



## ✨ 2.1 Dataset Tweet

In [96]:
column = df_sentiment.columns

# Drop the auto-named 'Unnamed: ...' column by label instead of by position.
# A positional `column[0]` drop removes a real column (`sentimen`) whenever the
# cell is executed a second time; selecting by label makes a re-run a no-op.
unnamed_columns = column[column.astype(str).str.startswith('Unnamed')]

df_sentiment = df_sentiment.drop(columns=unnamed_columns)

df_sentiment.head()

Unnamed: 0,sentimen,tweet
0,negatif,Kata @prabowo Indonesia tidak dihargai bangsa ...
1,netral,"Batuan Langka, Tasbih Jokowi Hadiah dari Habib..."
2,netral,"Di era Jokowi, ekonomi Indonesia semakin baik...."
3,positif,"Bagi Sumatera Selatan, Asian Games berdampak p..."
4,negatif,Negara kita ngutang buat bngun infrastruktur y...


## ✨ 2.2 Dataset Emoji

In [98]:
column = df_emoji.columns

# Drop the auto-named 'Unnamed: ...' column by label instead of by position.
# A positional `column[0]` drop removes a real column (`emoji-symbol`) whenever
# the cell is executed a second time; selecting by label makes a re-run a no-op.
unnamed_columns = column[column.astype(str).str.startswith('Unnamed')]

df_emoji = df_emoji.drop(columns=unnamed_columns)

df_emoji.head()

Unnamed: 0,emoji-symbol,meaning,meaning-indo,emoji-unicode,emoji-name,emoji-name-indo
0,😄,smiley,tersenyum,ðŸ˜„,grinning face with smiling eyes,wajah menyeringai dengan mata tersenyum
1,😃,smiley,tersenyum,ðŸ˜ƒ,grinning face with big eyes,wajah menyeringai dengan mata besar
2,😀,smiley,tersenyum,ðŸ˜€,grinning face,wajah menyeringai
3,😊,smiley,tersenyum,ðŸ˜Š,smiling face with smiling eyes,wajah tersenyum dengan mata tersenyum
4,🙂,blush,memerah,â˜ºï¸,slightly smiling face,Wajah yang sedikit tersenyum


# 🎯 Step 3: Save the Dataset

---


## ✨ 3.1 Dataset Tweet

In [99]:
# Destination of the cleaned tweet table on Google Drive.
file_path = '/content/drive/MyDrive/INDONESIA AI/Sentiment Analysis/Dataset/df_tweet.csv'

# index=False keeps the row index out of the file, so reloading it does not
# create another 'Unnamed: 0' column.
df_sentiment.to_csv(path_or_buf=file_path, index=False)

## ✨ 3.2 Dataset Emoji

In [100]:
# Destination of the cleaned emoji table on Google Drive.
file_path = '/content/drive/MyDrive/INDONESIA AI/Sentiment Analysis/Dataset/df_emoji.csv'

# index=False keeps the row index out of the file, so reloading it does not
# create another 'Unnamed: 0' column.
df_emoji.to_csv(path_or_buf=file_path, index=False)

## ✨ 3.3 Dataset Slang

In [101]:
def dict_to_json(data, file_name, base_dir='/content/drive/MyDrive/INDONESIA AI/Sentiment Analysis/Dataset/Slang Words/'):
  """Serialize `data` as JSON into the file `file_name` inside `base_dir`.

  Failures are reported with a printed message instead of being raised, so the
  notebook keeps running after a failed save.

  Args:
    data: JSON-serializable object to store.
    file_name: Name of the output file, used exactly as given (no extension
      is appended).
    base_dir: Directory that receives the file. Defaults to the "Slang Words"
      dataset folder on the mounted Google Drive.

  Returns:
    None.
  """

  file_path = os.path.join(base_dir, file_name)

  try:

    # Serialize before opening the target: opening with 'w' truncates the
    # file, so a serialization error must not be able to wipe out an
    # existing good copy.
    payload = json.dumps(data)

    with open(file_path, 'w') as json_file:

      json_file.write(payload)

    print(f'Data successfully saved to file: {file_path}')

  except Exception as e:

    print(f'Error {e}')


In [102]:
# Persist each slang dictionary under its own file name, in the original order.
for file_name, data in (
    ('json_slang_words1', slang_words1),
    ('json_slang_words2', slang_words2),
    ('json_slang_words3', slang_words3),
):
    dict_to_json(data, file_name)

Data successfully saved to file: /content/drive/MyDrive/INDONESIA AI/Sentiment Analysis/Dataset/Slang Words/json_slang_words1
Data successfully saved to file: /content/drive/MyDrive/INDONESIA AI/Sentiment Analysis/Dataset/Slang Words/json_slang_words2
Data successfully saved to file: /content/drive/MyDrive/INDONESIA AI/Sentiment Analysis/Dataset/Slang Words/json_slang_words3
