#### Modules

In [1]:
import pandas as pd
import numpy as np

import json
import re
import string
from datetime import datetime

#### Importing Data

In [2]:
# Load the raw tweet dump. The context manager guarantees the file handle is
# closed (the bare `open(...)` it replaces was never closed), and the explicit
# encoding keeps non-ASCII tweet text from depending on the platform default.
with open('./data/finalDataFixREVISI.json', encoding='utf-8') as raw_file:
    data = json.load(raw_file)

In [11]:
# Flatten the loaded JSON into a table; nested keys become dot-separated
# column names (e.g. 'users.username', 'metrics.retweets').
df = pd.json_normalize(data)

#### Data Lookup

In [12]:
# List the column names of the flattened frame.
df.columns

Index(['text', 'hashtags', 'created_at', 'reference_type', 'lang',
       'users.username', 'users.following', 'users.tweets', 'users.location',
       'users.verified', 'metrics.retweets', 'metrics.replies',
       'metrics.quotes', 'metrics.impressions', 'entities.mentions',
       'entities.url title', 'places.name', 'places.place_type',
       'places.country', 'places.country_code'],
      dtype='object')

In [13]:
# (rows, columns) of the raw frame, before any cleaning.
df.shape

(312463, 20)

#### Data Cleaning #1

In [14]:
# Peek at the first four raw rows before cleaning.
df.head(4)

Unnamed: 0,text,hashtags,created_at,reference_type,lang,users.username,users.following,users.tweets,users.location,users.verified,metrics.retweets,metrics.replies,metrics.quotes,metrics.impressions,entities.mentions,entities.url title,places.name,places.place_type,places.country,places.country_code
0,RT @geloraco: Buzzer Kasihan 2000 Karyawan Hol...,,2022-06-29T23:56:26.000Z,[retweeted],in,ZefriF,23.0,110.0,,False,635,0,0,0,[geloraco],[],,,,
1,@zomet13 Emangnya kamu belain waktu 2000 karya...,,2022-06-29T23:53:44.000Z,[replied_to],in,GYatmoko,15567.0,35473.0,Indonesia,False,0,1,0,0,[zomet13],,,,,
2,"RT @geloraco: Ribuan Pegawai Terancam PHK, Pol...",,2022-06-29T23:51:39.000Z,[retweeted],in,joko_wijiyono,70.0,6555.0,,False,123,0,0,0,[geloraco],[],,,,
3,RT @geloraco: Buzzer Kasihan 2000 Karyawan Hol...,,2022-06-29T23:50:04.000Z,[retweeted],in,drysblack,42.0,2297.0,"Pulo Gadung, Indonesia",False,635,0,0,0,[geloraco],[],,,,


In [15]:
# Built once at definition time instead of on every call. The ranges are the
# same as before, minus one line that was listed twice, so the set of removed
# characters is unchanged.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing through miscellaneous symbols/arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # very wide: also covers CJK, kana and Hangul
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # every supplementary-plane code point
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"  # zero-width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"
    "]+",
    re.UNICODE,
)


def remove_emojis(data):
    """Return `data` with emoji and other pictographic characters removed.

    Every maximal run of characters matched by `_EMOJI_PATTERN` is deleted
    (replaced by the empty string, not by a space).

    Note that the ranges are much broader than "emoji": U+24C2..U+1F251 plus
    U+10000..U+10FFFF means everything from U+24C2 upwards is stripped, which
    includes CJK, kana and Hangul text.

    Parameters
    ----------
    data : str
        Text to filter.

    Returns
    -------
    str
        The filtered text.
    """
    return _EMOJI_PATTERN.sub('', data)

# All patterns are raw strings: the previous non-raw literals relied on
# invalid escape sequences (\w, \d, \S, \/), which newer Python versions warn
# about at compile time.

# Any run of word characters that contains at least one digit.
_DIGIT_TOKEN_RE = re.compile(r"\w*\d\w*")

# Twitter-specific noise, removed outright. A separate bare-digit alternative
# is not needed: the digit-token pass above has already removed every digit.
_TWEET_NOISE_RE = re.compile(
    r"\bRT "                         # retweet marker; \b keeps e.g. "SMART " intact
    r"|\b[A-Za-z] \b|\b [A-Za-z]\b"  # stray single-letter words
    r"|&amp"                         # HTML-escaped ampersand (the ';' goes with punctuation)
    r"|https?://\S*"                 # links
    r"|@[a-zA-Z0-9_]{1,50}"          # mentions
    r"|#[a-zA-Z0-9_]{1,50}"          # hashtags
)

# Separator-like symbols that become a space so neighbouring words stay apart.
# Each symbol is matched on its own. The original alternation fused an
# invisible character (read here as U+2063 INVISIBLE SEPARATOR -- confirm)
# with U+22F1, so a bare U+22F1 was never matched.
_SYMBOL_RE = re.compile(
    "[,\n"
    "\u2026\u201d\u201c\u201e\u2019\u2018"  # ellipsis and curly/low quotes
    "\u2013\u2014\u2022"                    # en dash, em dash, bullet
    "\u00b2\u00d7\u00a9\u00b8\u00a8"        # superscript two, times, copyright, cedilla, diaeresis
    "\u23ec\u203c\u2049"                    # double down triangle, !!, !?
    "\u2063\u22f1\u22ee\u22f0\u22ef"        # invisible separator and dotted ellipses
    "]"
)

# Deletes every ASCII punctuation character.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def cleanTweet(tweet):
    """Normalise a raw tweet into lower-case, space-separated plain words.

    Steps, in order:
      1. drop every token that contains a digit;
      2. drop retweet markers, single-letter words, ``&amp``, links,
         mentions and hashtags;
      3. turn commas, newlines and assorted typographic symbols into spaces;
      4. delete ASCII punctuation (no space is inserted, so ``kata-kata``
         becomes ``katakata``);
      5. strip emoji via ``remove_emojis``;
      6. lower-case and collapse all whitespace to single spaces.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text; may be the empty string.
    """
    tweet = _DIGIT_TOKEN_RE.sub("", tweet)
    tweet = _TWEET_NOISE_RE.sub("", tweet)
    tweet = _SYMBOL_RE.sub(" ", tweet)
    tweet = tweet.translate(_PUNCT_TABLE)
    tweet = remove_emojis(tweet)
    # split()/join trims both ends and collapses inner whitespace in one go.
    return ' '.join(tweet.lower().split())

In [16]:
# Make both entity columns hold a list in every row: anything that is not
# already a list (e.g. a missing value) becomes an empty list.
for list_col in ('entities.mentions', 'entities.url title'):
    df[list_col] = df[list_col].apply(lambda cell: cell if isinstance(cell, list) else [])

# Order the tweets by timestamp (created_at is still the raw ISO-style string
# at this point) and renumber the rows from 0.
df = df.sort_values(by='created_at', ascending=True).reset_index(drop=True)

In [17]:
# Keep only tweets tagged as Indonesian. Filtering first means the (slow)
# per-row cleaning below is not wasted on rows that are about to be dropped.
df = df[df['lang'] == 'in'].reset_index(drop=True)

# Reformat the timestamp from the raw ISO-style string to 'dd-mm-YYYY HH:MM:SS'.
# Re-running this cell on the already converted column fails in strptime.
df['created_at'] = df['created_at'].apply(
    lambda ts: datetime.strptime(ts, '%Y-%m-%dT%H:%M:%S.%f%z').strftime('%d-%m-%Y %H:%M:%S')
)
df['text'] = df['text'].apply(cleanTweet)

# Missing flags become the boolean False. The previous fill value was the
# string 'False', which is truthy and left the column with mixed types.
df['users.verified'] = df['users.verified'].fillna(False).astype(bool)

# Lower-case every hashtag / mention. Anything that is not a list (None, NaN)
# becomes None instead of raising while being iterated.
for tag_col in ('hashtags', 'entities.mentions'):
    df[tag_col] = df[tag_col].apply(
        lambda items: [item.lower() for item in items] if isinstance(items, list) else None
    )

# Empty strings count as missing. The dict form always substitutes None;
# `replace('', value=None)` forward-filled instead on pandas < 1.4.
df = df.replace({'': None})

# Drop tweets whose text was cleaned down to nothing and renumber the rows.
df = df.dropna(subset=['text']).reset_index(drop=True)

In [20]:
# Spot-check the last two rows of the cleaned frame.
df.tail(2)

Unnamed: 0,text,hashtags,created_at,reference_type,lang,users.username,users.following,users.tweets,users.location,users.verified,metrics.retweets,metrics.replies,metrics.quotes,metrics.impressions,entities.mentions,entities.url title,places.name,places.place_type,places.country,places.country_code
311623,sebelumnya dia minta duit dan minta jajan ke a...,,30-03-2023 23:52:35,[retweeted],in,njaesab,946.0,16434.0,"Sidoarjo, jawa timur",False,257,0,0,0,[tanyakanrl],[],,,,
311624,sebelumnya dia minta duit dan minta jajan ke a...,,30-03-2023 23:56:59,[retweeted],in,sipalingorange,287.0,6167.0,,False,257,0,0,0,[tanyakanrl],[],,,,


In [21]:
# (rows, columns) after the language filter and the empty-text drop.
df.shape

(311625, 20)

In [22]:
# One row per distinct cleaned text (the first occurrence is kept).
new_df = df.drop_duplicates(subset=['text']).reset_index(drop=True)

# Row masks over the de-duplicated frame.
has_hashtags = new_df['hashtags'].notnull()
has_no_reference = new_df['reference_type'].isnull()

# Tweets carrying a hashtags value.
tweets_w_hashtags = new_df[has_hashtags].reset_index(drop=True)
# Tweets with no reference_type, i.e. not a retweet / quote / reply
# (despite the name, this subset is not restricted to tweets with hashtags).
hashtags_wo_rt = new_df[has_no_reference].reset_index(drop=True)
# Tweets satisfying both conditions.
tweets_w_hashtags_wo_rt = new_df[has_no_reference & has_hashtags].reset_index(drop=True)

In [23]:
# Report the row count of each subset built from the de-duplicated frame.
# "Pure tweets" are the rows whose reference_type is null (hashtags_wo_rt).
print(f"Tweets without text duplicates: {len(new_df)}")
print(f"Tweets with hashtags: {len(tweets_w_hashtags)}")
print(f"Pure tweets (w/o RT/QT/Reply): {len(hashtags_wo_rt)}")
print(f"Pure tweets (w/o RT/QT/Reply) with hashtags: {len(tweets_w_hashtags_wo_rt)}")

Tweets without text duplicates: 98743
Tweets with hashtags: 11867
Pure tweets (w/o RT/QT/Reply): 40867
Pure tweets (w/o RT/QT/Reply) with hashtags: 9762


In [24]:
# Show how missing hashtags and the reference type are stored for row 1.
new_df.loc[1, 'hashtags'], new_df.loc[1, 'reference_type']

(None, ['retweeted'])

In [26]:
# (rows, columns) of the de-duplicated frame.
new_df.shape

(98743, 20)

In [27]:
# Display the de-duplicated tweets whose hashtags value is not null.
tweets_w_hashtags

Unnamed: 0,text,hashtags,created_at,reference_type,lang,users.username,users.following,users.tweets,users.location,users.verified,metrics.retweets,metrics.replies,metrics.quotes,metrics.impressions,entities.mentions,entities.url title,places.name,places.place_type,places.country,places.country_code
0,banding ukt turunnya gaji hingga phk banyak di...,"[himitpens, kabinetagnibrata, forzainformatika...",01-01-2022 14:03:02,,in,himitpens,156.0,4960.0,Surabaya,False,0,1,0,0,[],[],,,,
1,ada suara kemarin tentang pembubaran nasib phk...,[brin],02-01-2022 08:00:54,[quoted],in,dir1ku,3254.0,65752.0,Malang,False,1,0,0,0,[brin_indonesia],[],,,,
2,di tangan panas skidipap lembaga eijkman berha...,[kumparansains],02-01-2022 08:40:35,[quoted],in,awLdh,666.0,82112.0,,False,0,1,0,0,[],[],"Jati, Indonesia",city,Indonesia,ID
3,dd biaya pernikahan dibwh tahun sakit tdk puny...,"[danadarurat, shilafintips]",02-01-2022 09:08:39,,in,bregaswaras_id,16.0,15287.0,,False,0,0,0,0,[],[],,,,
4,mengakhiri tahun dengan menyampaikan laporan k...,[dprd_cilegon],02-01-2022 15:31:03,,in,faturohmi,423.0,7490.0,"Cilegon, banten",False,1,1,0,0,[gerindra],[],,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11862,sudah diphk nggak dapet pesangon pula pil pahi...,[terminalmojok],30-03-2023 08:45:44,,in,mojokdotco,489.0,132062.0,redaksi@mojok.co,False,1,0,0,2535,[],[Balada Korban PHK Sepihak Shox Rumahan Merebu...,,,,
11863,kembali pada isu utama yang lebih berbahaya ya...,[usuttuntas],30-03-2023 08:59:55,,in,Neilarmstronkk,782.0,3182.0,Bdg,False,0,0,0,37,[],[],,,,
11864,maja people ada kabar kurang enak dari perusah...,"[majalabs, waltdisney, disney, metaverse, web3...",30-03-2023 10:00:12,,in,majalabs_xyz,79.0,1176.0,Indonesia,False,0,1,0,40,[],[],,,,
11865,startup berbasis arisan online shox rumahan ba...,"[bisnisupdate, update, bisnis, oneliner]",30-03-2023 13:01:31,,in,kumparan,3.0,953869.0,,True,0,0,1,1963,[],[],,,,


In [28]:
# Persist the cleaned frame and its de-duplicated version, without the index column.
# NOTE(review): 'REVISi' in the first filename differs in case from the other
# 'REVISI' filenames -- confirm what downstream readers expect before renaming.
df.to_csv('./data/initial_data_REVISi.csv', index=False)
new_df.to_csv('./data/data_non_duplicate_tweets_REVISI.csv', index=False)