### 1 - Import packages

In [1]:
""" Package for language translation """
!pip install deep-translator



In [3]:
import pandas as pd
import html

import re

from deep_translator import GoogleTranslator

import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
# Fetch the NLTK resources used by the preprocessing helpers further down.
nltk.download('punkt')  # tokenizer models used by nltk.sent_tokenize / nltk.word_tokenize
nltk.download('averaged_perceptron_tagger')  # tagger model used by nltk.pos_tag
nltk.download('wordnet')  # WordNet data used by nltk.stem.WordNetLemmatizer
nltk.download('stopwords')  # word lists read through stopwords.words("english")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jewel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jewel\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jewel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jewel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### 2 - Import Data

In [6]:
# Load the Instagram hashtag results; the path is relative to the notebook's working directory.
insta_df = pd.read_csv("instagram_hashtag_results_crystalsg_cleaned.csv")

In [7]:
# Number of missing values in each column.
insta_df.isna().sum()

postUrl             0
profileUrl          0
username            0
fullName           82
commentCount        0
likeCount           0
pubDate             0
description         0
imgUrl             98
postId              0
ownerId             0
type                0
query               0
timestamp           0
location         1606
isSidecar           0
sidecarMedias    1099
videoUrl         1900
viewCount        1901
dtype: int64

### 3 - Drop rows without captions

In [8]:
# Keep only the posts that have a caption.
# .copy() makes the result an independent frame, so the columns added to it in
# later cells do not raise SettingWithCopyWarning or depend on insta_df.
real_insta_df = insta_df[insta_df["description"].notna()].copy()

### 4 - Clean captions

1. Replace `\n` and `\t` with spaces, strip non-ASCII and non-printable characters, and decode HTML entities
2. Translate captions written in other languages to English
3. Remove mentions from caption
4. Remove hashtags from caption
5. Shorten words to their basic form while keeping the original meaning
6. Remove stopwords so that our topic modeling will be more meaningful
7. Store clean captions in a new column
8. Store clean caption as a list of words in another new column

In [36]:
### 4 - Helper functions for data preprocessing
"""
Code references:
    https://pythonguides.com/remove-unicode-characters-in-python/
    https://www.kite.com/python/answers/how-to-decode-html-entities-in-python
"""
def decode_text(text):
    """Normalise a caption fragment to plain, printable ASCII.

    Steps, in order:
      1. decode HTML entities (``&amp;`` -> ``&``),
      2. turn newlines, tabs and non-breaking spaces into ordinary spaces,
      3. drop every non-ASCII character,
      4. drop any remaining non-printable character (e.g. ``\\r``).

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text; it may be empty.
    """
    # Entities are decoded first so that the characters they expand to
    # (e.g. "&eacute;" or "&#128153;") pass through the same ASCII filter as
    # literal characters instead of being re-introduced after it.
    unescaped = html.unescape(text)
    # "&nbsp;" decodes to U+00A0; map it to a space together with \n and \t so
    # that neighbouring words are not glued together when it is filtered out.
    spaced = re.sub(r"[\n\t\xa0]", " ", unescaped)
    # remove non-ASCII characters in string
    ascii_only = spaced.encode("ascii", "ignore").decode("ascii")
    return "".join(char for char in ascii_only if char.isprintable())

"""
Code reference:
    https://medium.com/analytics-vidhya/how-to-translate-text-with-python-9d203139dcf5
"""
def translate_text(text):
    """Translate ``text`` into English using deep_translator's GoogleTranslator.

    Only the target language ('en') is specified; the source language is left
    at the translator's default. The return value is whatever
    ``GoogleTranslator.translate`` returns for ``text``.
    """
    translator = GoogleTranslator(target='en')
    return translator.translate(text)
"""
Code reference:
    https://catriscode.com/2021/03/02/extracting-or-removing-mentions-and-hashtags-in-tweets-using-python/
"""
def remove_mentions(text):
    """Remove ``@username`` mentions from ``text``.

    A username is matched as letters, digits, underscores and periods, and
    must not end with a period, so handles such as ``@moonwanderer.est2020``
    are removed in full while a sentence-ending full stop after a mention is
    kept.

    Parameters
    ----------
    text : str
        Text that may contain mentions.

    Returns
    -------
    str
        ``text`` with every mention deleted (surrounding spaces are kept).
    """
    # The optional group allows periods inside the handle but forces the last
    # matched character to be a letter, digit or underscore.
    return re.sub(r"@[A-Za-z0-9_](?:[A-Za-z0-9_.]*[A-Za-z0-9_])?", "", text)

def remove_hashtags(text):
    """Return ``text`` with every ``#tag`` (letters, digits, underscores) deleted."""
    hashtag_pattern = re.compile("#[A-Za-z0-9_]+")
    return hashtag_pattern.sub("", text)

def remove_stopwords(words_list):
    """Filter NLTK's English stop words out of ``words_list``.

    Parameters
    ----------
    words_list : list of str
        Tokens to filter. Matching is exact and case-sensitive.

    Returns
    -------
    list of str
        The tokens that are not stop words, in their original order.
    """
    # A frozenset gives constant-time membership tests; scanning the list
    # returned by stopwords.words() for every token gives the same result
    # but does so with a linear search per token.
    stop_set = frozenset(stopwords.words("english"))
    return [word for word in words_list if word not in stop_set]

def pos_to_wordnet(nltk_tag):
    """Convert an NLTK POS tag into the corresponding ``wordnet`` POS constant.

    The decision is made on the tag's first letter: ``J`` -> ``wordnet.ADJ``,
    ``V`` -> ``wordnet.VERB``, ``N`` -> ``wordnet.NOUN``, ``R`` -> ``wordnet.ADV``.
    Any other tag yields ``None``.
    """
    prefix_to_attr = (
        ("J", "ADJ"),
        ("V", "VERB"),
        ("N", "NOUN"),
        ("R", "ADV"),
    )
    for prefix, attr_name in prefix_to_attr:
        if nltk_tag.startswith(prefix):
            # Looked up only on a match, so `wordnet` is not touched for other tags.
            return getattr(wordnet, attr_name)
    return None

def lemmatize_words(word_list, keep_words=("lavval", "newagefsg")):
    """POS-tag ``word_list`` and lemmatize the words that map to a WordNet POS.

    Words whose NLTK tag maps to adjective, verb, noun or adverb (see
    ``pos_to_wordnet``) are lemmatized with that POS. Words with any other tag
    are dropped, unless they appear in ``keep_words``, in which case they are
    lemmatized with the lemmatizer's default POS.

    Parameters
    ----------
    word_list : list of str
        Tokens of a single sentence, in order.
    keep_words : collection of str, optional
        Words to retain even when their tag has no WordNet equivalent.
        Defaults to the two words that were previously hard-coded.

    Returns
    -------
    list of str
        Lemmatized words, in their original order.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmatized_words = []
    # nltk.pos_tag returns a list of (word, NLTK POS tag) tuples.
    for word, nltk_tag in nltk.pos_tag(word_list):
        wordnet_tag = pos_to_wordnet(nltk_tag)
        if wordnet_tag is not None:
            # need POS tag as 2nd argument as it helps lemmatize the words more accurately
            lemmatized_words.append(lemmatizer.lemmatize(word, wordnet_tag))
        elif word in keep_words:
            # wordnet_tag is None here, so the former `tag in [wordnet.NOUN]`
            # test could never be true and has been removed.
            lemmatized_words.append(lemmatizer.lemmatize(word))
    return lemmatized_words

## V2 - Remove mentions and hashtags from posts

In [38]:
def clean_original_text(text):
    """Turn a raw caption into a space-separated string of cleaned words.

    The caption is lower-cased and split into sentences. Each sentence is then
    decoded (``decode_text``), translated to English (``translate_text``),
    stripped of mentions and hashtags, tokenized on ``\\w+``, lemmatized and
    filtered for stop words. The surviving words of all sentences are joined
    with single spaces.

    Parameters
    ----------
    text : str
        The original caption.

    Returns
    -------
    str
        The cleaned caption; an empty string when no word survives.
    """
    # Built once and reused for every sentence.
    word_tokenizer = nltk.RegexpTokenizer(r'\w+')
    clean_list = []
    for sentence in nltk.sent_tokenize(text.lower()):
        decoded_sentence = decode_text(sentence)
        if not decoded_sentence.strip():
            # Nothing is left after decoding (e.g. the sentence held only
            # non-ASCII characters), so there is nothing to translate.
            continue
        translated_sentence = translate_text(decoded_sentence)
        if translated_sentence is None:
            # The translator produced no result for this sentence; skip it.
            continue
        tagless_sentence = remove_hashtags(remove_mentions(translated_sentence))
        words_list = word_tokenizer.tokenize(tagless_sentence)
        clean_list.extend(remove_stopwords(lemmatize_words(words_list)))
    return ' '.join(clean_list)

In [39]:
# Clean every caption and store the result in a new column.
# clean_original_text calls translate_text once per sentence, so this cell is
# presumably slow and network-bound — TODO confirm.
real_insta_df["clean_description"] = real_insta_df["description"].apply(clean_original_text)

In [40]:
# Store each cleaned caption as a list of word tokens in a second new column.
real_insta_df["clean_tokens"] = real_insta_df["clean_description"].apply(nltk.word_tokenize)

In [41]:
# Display the frame, including the two new columns clean_description and clean_tokens.
real_insta_df

Unnamed: 0,postUrl,profileUrl,username,fullName,commentCount,likeCount,pubDate,description,imgUrl,postId,...,type,query,timestamp,location,isSidecar,sidecarMedias,videoUrl,viewCount,clean_description,clean_tokens
0,https://www.instagram.com/p/CaJQO5HPHO1/,https://www.instagram.com/petalsofarosez,petalsofarosez,Amanda Low,16,910,2022-02-19T03:56:44.000Z,You deserve a single shot! The night where I w...,https://scontent-iad3-2.cdninstagram.com/v/t51...,2776822037392160000,...,Photo,#crystalsg,2022-03-18T04:48:25.073Z,Singapore / Singapura / 新加坡 / சிங்கப்பூர்,False,,,,deserve single shot night appreciate beauty co...,"[deserve, single, shot, night, appreciate, bea..."
1,https://www.instagram.com/p/CZMdip8PCzX/,https://www.instagram.com/princessezyt,princessezyt,Triple R Mummy Princess,102,527,2022-01-26T13:19:21.000Z,"This CNY, get yourself or your love ones a NEW...",https://scontent-iad3-2.cdninstagram.com/v/t51...,2759710596516293600,...,Carousel,#crystalsg,2022-03-18T04:48:25.073Z,"PARKROYAL COLLECTION Marina Bay, Singapore",True,7.0,,,cny get love one new pretty crystal water bott...,"[cny, get, love, one, new, pretty, crystal, wa..."
2,https://www.instagram.com/p/CZviYgxvmz7/,https://www.instagram.com/moonwanderer.est2020,moonwanderer.est2020,moonwandererclub,9,711,2022-02-09T04:15:04.000Z,— my 💙 for you as deep as the ocean\n\nFeaturi...,https://scontent-iad3-2.cdninstagram.com/v/t51...,2769583513899265000,...,Carousel,#crystalsg,2022-03-18T04:48:25.073Z,Atlantic Ocean,True,3.0,,,deep ocean feature gemmy teal blue purple fluo...,"[deep, ocean, feature, gemmy, teal, blue, purp..."
3,https://www.instagram.com/p/CUaL0oNPhEm/,https://www.instagram.com/crystalliac,crystalliac,💫 Crystals Shop 💫,5,131,2021-09-29T14:36:55.000Z,• LET’S GO GREEN! • (PREVIEW)\nJoin us on Satu...,https://scontent-iad3-2.cdninstagram.com/v/t51...,2673501323894067700,...,Carousel,#crystalsg,2022-03-18T04:48:25.073Z,Singapore,True,9.0,,,let go green preview join saturday night octob...,"[let, go, green, preview, join, saturday, nigh..."
4,https://www.instagram.com/p/CW76Q2ZPwI9/,https://www.instagram.com/rehauscrystals,rehauscrystals,RehausCrystals,109,51204,2021-12-01T10:00:27.000Z,🍬[ℙ𝕚𝕟𝕜 ℝ𝕠𝕤𝕖 ℚ𝕦𝕒𝕣𝕥𝕫 𝕋𝕦𝕞𝕓𝕝𝕖𝕤]🍬\n\n💕Lovin these P...,https://scontent-iad3-2.cdninstagram.com/v/t51...,2719023044663116300,...,Carousel,#crystalsg,2022-03-18T04:48:25.074Z,,True,4.0,,,pink gemmy rise quartz tumble literal candy ri...,"[pink, gemmy, rise, quartz, tumble, literal, c..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1993,https://www.instagram.com/p/Ca5WLPNBANU/,https://www.instagram.com/crystal_stone_agate,crystal_stone_agate,Crystal_Stone_Agate,0,5,2022-03-09T20:12:12.000Z,🆕𝔸𝕔𝕔𝕖𝕡𝕥 𝕠𝕣𝕕𝕖𝕣\n\n✨𝙒𝙝𝙞𝙩𝙚 𝙙𝙧𝙪𝙯𝙮 𝙖𝙜𝙖𝙩𝙚.𝙢𝙚𝙧𝙢𝙖𝙞𝙙 𝙩𝙖...,https://scontent-iad3-2.cdninstagram.com/v/t51...,2790358973394715476,...,Photo,#crystalsg,2022-03-18T04:53:04.413Z,,False,,,,dm,[dm]
1994,https://www.instagram.com/p/Ca5WD0EBxVw/,https://www.instagram.com/crystal_stone_agate,crystal_stone_agate,Crystal_Stone_Agate,0,6,2022-03-09T20:11:11.000Z,🆕𝔸𝕔𝕔𝕖𝕡𝕥 𝕠𝕣𝕕𝕖𝕣\n\n✨𝙒𝙝𝙞𝙩𝙚 𝙙𝙧𝙪𝙯𝙮 𝙖𝙜𝙖𝙩𝙚.𝙢𝙚𝙧𝙢𝙖𝙞𝙙 𝙩𝙖...,https://scontent-iad3-2.cdninstagram.com/v/t51...,2790358463216555376,...,Photo,#crystalsg,2022-03-18T04:53:04.413Z,,False,,,,dm,[dm]
1995,https://www.instagram.com/p/Ca5UPA2hVWG/,https://www.instagram.com/internationalgemshop,internationalgemshop,International Gemshop,0,13,2022-03-09T19:55:14.000Z,Banded Bumblebee Calcite Spheres\nAvailable \n...,https://scontent-iad3-2.cdninstagram.com/v/t51...,2790350436769813894,...,Photo,#crystalsg,2022-03-18T04:53:04.413Z,"Peshawar, Pakistan",False,,,,band bumblebee calcite spheres available ship ...,"[band, bumblebee, calcite, spheres, available,..."
1996,https://www.instagram.com/p/Ca5QCWShv5v/,https://www.instagram.com/ervina_crystal123,ervina_crystal123,𝔼𝕣𝕧𝕚𝕟𝕒 𝕔𝕖𝕣𝕪𝕤𝕥𝕒𝕝𝟙𝟚𝟛,0,1,2022-03-09T19:18:33.000Z,🆕 Plume agate moon carving💎\n☑️AVAILABLE for s...,https://scontent-iad3-2.cdninstagram.com/v/t51...,2790331974249021039,...,Photo,#crystalsg,2022-03-18T04:53:04.413Z,,False,,,,plume agate moon carve available sale dm order,"[plume, agate, moon, carve, available, sale, d..."


### 5 - Save cleaned data to csv file

In [74]:
# Write the cleaned frame next to the notebook.
# NOTE(review): to_csv also writes the row index as an unnamed first column by
# default; pass index=False if downstream readers should not see it — confirm.
# NOTE(review): clean_tokens holds lists, which a CSV stores as their string
# representation; readers will need to parse them back — confirm.
real_insta_df.to_csv("instagram_crystalsg_hashtag_clean_new.csv")