### 1 - Import packages

In [43]:
""" Package for language translation """
!pip install deep-translator



In [72]:
import pandas as pd
import html

import re

from deep_translator import GoogleTranslator

import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jewel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jewel\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jewel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jewel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### 2 - Import Data

In [4]:
insta_df = pd.read_csv("instagram_hashtag_results_17Mar_cleaned.csv")

In [6]:
insta_df.isnull().sum()

postUrl             0
profileUrl          0
username            0
fullName           62
commentCount        0
likeCount           0
pubDate             0
description         3
imgUrl            247
postId              0
ownerId             0
type                0
query               0
timestamp           0
location         1451
isSidecar           0
sidecarMedias    1313
videoUrl         1671
viewCount        1692
dtype: int64

### 3 - Drop rows without captions

In [7]:
real_insta_df = insta_df[~insta_df.description.isnull()]

### 4 - Clean captions

1. Remove any HTML tags
2. Translate caption from other language to English
3. Shorten words to their basic form while keeping the original meaning
4. Remove stopwords so that our topic modeling will be more meaningful
5. Store clean captions in a new column
6. Store clean caption as a list of words in another new column

In [73]:
### 4 - Helper functions for data preprocessing
"""
Code references:
    https://pythonguides.com/remove-unicode-characters-in-python/
    https://www.kite.com/python/answers/how-to-decode-html-entities-in-python
"""
def decode_text(text):
    # remove non-ASCII characters in string
    decoded_text = text.encode('ascii', 'ignore').decode('utf-8')

    # decode HTML entities
    decoded_html = html.unescape(decoded_text)
    return ''.join([word for word in decoded_html if word.isprintable()])

"""
Code reference:
    https://medium.com/analytics-vidhya/how-to-translate-text-with-python-9d203139dcf5
"""
def translate_text(text):
    translated_text = GoogleTranslator(target='en').translate(text)
    return translated_text
"""
Code reference:
    https://catriscode.com/2021/03/02/extracting-or-removing-mentions-and-hashtags-in-tweets-using-python/
"""
# def remove_mentions(text):
#     return re.sub("@[A-Za-z0-9_]+","", text)

def remove_stopwords(words_list):
    stop_list = stopwords.words("english")
    return [word for word in words_list if word not in stop_list]

def pos_to_wordnet(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

def lemmatize_words(word_list):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # POS (part-of-speech) tagging
    # nltk_tagged -> a list of tuples (word, pos tag)
    nltk_tagged = nltk.pos_tag(word_list)

    # returns a list of tuples of words and their wordnet_tag (after conversion from NLTK tag)
    wordnet_tagged = list(map(lambda x: (x[0], pos_to_wordnet(x[1])), nltk_tagged))

    # lemmatizing
    lemmatized_words = []
    for word, tag in wordnet_tagged:
        if tag is not None:
            # need POS tag as 2nd argument as it helps lemmatize the words more accurately
            lemmatized_words.append(lemmatizer.lemmatize(word, tag))
        elif tag in [wordnet.NOUN] or word == "lavval" or word == "newagefsg":
            lemmatized_words.append(lemmatizer.lemmatize(word))
    return lemmatized_words

def clean_original_text(text):
    text = text.lower()
    clean_list = []
    sentence_list = nltk.sent_tokenize(text)
    for sentence in sentence_list:
        decoded_sentence = decode_text(sentence)
        translated_sentence = translate_text(decoded_sentence)
        if translated_sentence is not None:
            words_list = nltk.RegexpTokenizer(r'\w+').tokenize(translated_sentence)
            lemmatized_words = lemmatize_words(words_list)
            useful_words = remove_stopwords(lemmatized_words)

            if len(useful_words) > 0:
                clean_list.extend(useful_words)
    clean_text = ' '.join(clean_list)

    return clean_text

In [63]:
real_insta_df["clean_description"] = real_insta_df["description"].apply(clean_original_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  real_insta_df["clean_description"] = real_insta_df["description"].apply(clean_original_text)


In [67]:
real_insta_df["clean_tokens"] = real_insta_df["clean_description"].apply(nltk.word_tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  real_insta_df["clean_tokens"] = real_insta_df["clean_description"].apply(nltk.word_tokenize)


In [68]:
real_insta_df

Unnamed: 0,postUrl,profileUrl,username,fullName,commentCount,likeCount,pubDate,description,imgUrl,postId,...,type,query,timestamp,location,isSidecar,sidecarMedias,videoUrl,viewCount,clean_description,clean_tokens
0,https://www.instagram.com/p/CbKYWL8AuAF/,https://www.instagram.com/leassteinwelt,leassteinwelt,💎Kristall Paradies💎,9,501,2022-03-16T10:58:16.000Z,Wir haben schon die letzten Tage super fleißig...,https://scontent-yyz1-1.cdninstagram.com/v/t51...,2795153596499550000,...,Photo,#crystals,2022-03-17T01:27:17.629Z,,False,,,,work hard past day prepare mega restock time n...,"[work, hard, past, day, prepare, mega, restock..."
1,https://www.instagram.com/p/CbK9hAvhxjC/,https://www.instagram.com/awakenstardust,awakenstardust,Bring Love to People❤,26,1157,2022-03-16T16:23:03.000Z,Drop a 👁️ if you can relate\r\nFollow @awakens...,https://scontent-yyz1-1.cdninstagram.com/v/t51...,2795317068114106400,...,Photo,#crystals,2022-03-17T01:27:17.629Z,,False,,,,drop relatefollow awakenstardust spiritualawak...,"[drop, relatefollow, awakenstardust, spiritual..."
2,https://www.instagram.com/p/CbKJdGUj_gg/,https://www.instagram.com/praktijk_matricaria,praktijk_matricaria,Praktijk Matricaria,26,181,2022-03-16T08:48:08.000Z,Spirit kwarts 💕\r\nEindelijk! Na een lange zoe...,https://scontent-yyz1-1.cdninstagram.com/v/t51...,2795088100807670000,...,Photo,#crystals,2022-03-17T01:27:17.629Z,,False,,,,spirit quartz finally long search find beautif...,"[spirit, quartz, finally, long, search, find, ..."
3,https://www.instagram.com/p/CbHDHKrPN1X/,https://www.instagram.com/nickycrystcollection,nickycrystcollection,,18,1438,2022-03-15T03:54:59.000Z,Beautiful pink amethyst freedom from @elo.di_ ...,https://scontent-yyz1-1.cdninstagram.com/v/t51...,2794215780445380000,...,Photo,#crystals,2022-03-17T01:27:17.629Z,,False,,,,beautiful pink amethyst freedom elo di_ druzy ...,"[beautiful, pink, amethyst, freedom, elo, di_,..."
4,https://www.instagram.com/p/CbIAdAXupdV/,https://www.instagram.com/sacredsoulfulshop,sacredsoulfulshop,Crystals &amp; Wellness,9,150,2022-03-15T12:51:00.000Z,We’re you able to scoop up some Orca Agate las...,https://scontent-yyz1-1.cdninstagram.com/v/t51...,2794485562046322700,...,Photo,#crystals,2022-03-17T01:27:17.629Z,"Vaughan, Ontario",False,,,,able scoop orca agate last week dont worry sur...,"[able, scoop, orca, agate, last, week, dont, w..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1913,https://www.instagram.com/p/CbK3NCerlnY/,https://www.instagram.com/positiveknotsjewelry,positiveknotsjewelry,🌻Positive Knots Jewelry🧘‍♀️,1,15,2022-03-16T15:27:54.000Z,Come visit me this Saturday at the @tacomacomm...,https://scontent-amt2-1.cdninstagram.com/v/t51...,2795289307310348760,...,Photo,#crystaljewelry,2022-03-17T02:04:53.632Z,Tacoma Arts Community Center,False,,,,come visit saturday tacomacommunitycenter hill...,"[come, visit, saturday, tacomacommunitycenter,..."
1914,https://www.instagram.com/p/CbK3G15j012/,https://www.instagram.com/breezydaystudio,breezydaystudio,Breezy Day Studio,0,3,2022-03-16T15:27:03.000Z,March Madness SALE 40% OFF \r\n\r\n🌿Shop Link ...,https://scontent-ams4-1.cdninstagram.com/v/t51...,2795288881485794678,...,Carousel,#crystaljewelry,2022-03-17T02:04:53.632Z,"Penticton, British Columbia",True,3.0,,,march madness sale shop link breezydaystudio d...,"[march, madness, sale, shop, link, breezydayst..."
1915,https://www.instagram.com/p/CbK3QDTOcN8/,https://www.instagram.com/opossum_doodles,opossum_doodles,Remy's art,3,15,2022-03-16T15:28:18.000Z,🌈 tear-drop oplaite earrings 🌈\r\n\r\nSpring s...,https://scontent-amt2-1.cdninstagram.com/v/t51...,2795289514350330748,...,Photo,#crystaljewelry,2022-03-17T02:04:53.632Z,,False,,,,tear drop oplaite earring spring sale march it...,"[tear, drop, oplaite, earring, spring, sale, m..."
1916,https://www.instagram.com/p/CbK24RZt8UK/,https://www.instagram.com/empoweredcrystals,empoweredcrystals,Empowered Crystals,0,47,2022-03-16T15:25:16.000Z,A few blue kyanite rings. A wonderful cleansin...,https://scontent-ams4-1.cdninstagram.com/v/t51...,2795287880224195850,...,Photo,#crystaljewelry,2022-03-17T02:04:53.632Z,,False,,,,blue kyanite ring wonderful cleansing stone cr...,"[blue, kyanite, ring, wonderful, cleansing, st..."


### 5 - Save cleaned data to csv file

In [74]:
real_insta_df.to_csv("instagram_crystal_hashtag_clean.csv")