In [37]:
import pandas as pd
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True


**Import dataset**

In [38]:
df = pd.read_csv('sample.csv')
df

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,119237,105834,True,Wed Oct 11 06:55:44 +0000 2017,@AppleSupport causing the reply to be disregar...,119236,
1,119238,ChaseSupport,False,Wed Oct 11 13:25:49 +0000 2017,@105835 Your business means a lot to us. Pleas...,,119239.0
2,119239,105835,True,Wed Oct 11 13:00:09 +0000 2017,@76328 I really hope you all change but I'm su...,119238,
3,119240,VirginTrains,False,Tue Oct 10 15:16:08 +0000 2017,@105836 LiveChat is online at the moment - htt...,119241,119242.0
4,119241,105836,True,Tue Oct 10 15:17:21 +0000 2017,@VirginTrains see attached error message. I've...,119243,119240.0
...,...,...,...,...,...,...,...
88,119330,105859,True,Wed Oct 11 13:50:42 +0000 2017,@105860 I wish Amazon had an option of where I...,119329,119331.0
89,119331,105860,True,Wed Oct 11 13:47:14 +0000 2017,They reschedule my shit for tomorrow https://t...,119330,
90,119332,Tesco,False,Wed Oct 11 13:34:06 +0000 2017,"@105861 Hey Sara, sorry to hear of the issues ...",119333,119334.0
91,119333,105861,True,Wed Oct 11 14:05:18 +0000 2017,@Tesco bit of both - finding the layout cumber...,119335119336,119332.0


**extract text only**

In [33]:
text_data = df['text']
text_data.head()

0    @AppleSupport causing the reply to be disregar...
1    @105835 Your business means a lot to us. Pleas...
2    @76328 I really hope you all change but I'm su...
3    @105836 LiveChat is online at the moment - htt...
4    @VirginTrains see attached error message. I've...
Name: text, dtype: object

**Functions for preprocessing that includes  
▶Function for removing emojis  
▶Function for removing punctuation ,Tokenization,Remove stopwords,Stemming (using Porter Stemmer),Lemmatization (using WordNet Lemmatizer)**

In [39]:
import re

def remove_emojis(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F700-\U0001F77F"  # alchemical symbols
                           u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                           u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                           u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                           u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                           u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                           u"\U0001F004-\U0001F0CF"  # Miscellaneous Symbols and Pictographs
                           u"\U0001F0D0-\U0001F0FF"  # Emoji for Legacy Computing
                           u"\U0001F1E0-\U0001F1FF"  # Regional Indicator Symbols
                           u"\U0001F200-\U0001F251"  # Enclosed Alphanumeric Supplement
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


def preprocess_text(text):
   # Remove emojis
    text_removed_emojis = remove_emojis(text)
    # Remove punctuation
    text_removed_emojis_ponc  = ''.join([char for char in text_removed_emojis if char not in string.punctuation])

    # Tokenization
    words = word_tokenize(text_removed_emojis_ponc)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words_without_sw = [word for word in words if word.lower() not in stop_words]

    # Stemming (using Porter Stemmer)
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in words]

    # Lemmatization (using WordNet Lemmatizer)
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    # Return the preprocessed text
    return {
        "Original Text": text,
        "Punctuation Removed": text_removed_emojis_ponc,
        "Tokenized": words,
        "Stopwords Removed": words_without_sw,
        "Stemmed": stemmed_words,
        "Lemmatized": lemmatized_words
    }

**Applying the preprocessing function to each text**

In [40]:
preprocessed_data = text_data.apply(preprocess_text)

# Print the preprocessed data
print(preprocessed_data)

0     {'Original Text': '@AppleSupport causing the r...
1     {'Original Text': '@105835 Your business means...
2     {'Original Text': '@76328 I really hope you al...
3     {'Original Text': '@105836 LiveChat is online ...
4     {'Original Text': '@VirginTrains see attached ...
                            ...                        
88    {'Original Text': '@105860 I wish Amazon had a...
89    {'Original Text': 'They reschedule my shit for...
90    {'Original Text': '@105861 Hey Sara, sorry to ...
91    {'Original Text': '@Tesco bit of both - findin...
92    {'Original Text': '@105861 If that doesn't hel...
Name: text, Length: 93, dtype: object


**First element**

In [41]:
# Get the first element (row) of the preprocessed data
first_element = preprocessed_data.iloc[0]

# Print each preprocessing step separately
print("Original Text:")
print(first_element["Original Text"])
print("\nText with Punctuation Removed:")
print(first_element["Punctuation Removed"])
print("\nTokenized Text:")
print(first_element["Tokenized"])
print("\nText with Stopwords Removed:")
print(first_element["Stopwords Removed"])
print("\nStemmed Text:")
print(first_element["Stemmed"])
print("\nLemmatized Text:")
print(first_element["Lemmatized"])

Original Text:
@AppleSupport causing the reply to be disregarded and the tapped notification under the keyboard is opened😡😡😡

Text with Punctuation Removed:
AppleSupport causing the reply to be disregarded and the tapped notification under the keyboard is opened

Tokenized Text:
['AppleSupport', 'causing', 'the', 'reply', 'to', 'be', 'disregarded', 'and', 'the', 'tapped', 'notification', 'under', 'the', 'keyboard', 'is', 'opened']

Text with Stopwords Removed:
['AppleSupport', 'causing', 'reply', 'disregarded', 'tapped', 'notification', 'keyboard', 'opened']

Stemmed Text:
['applesupport', 'caus', 'the', 'repli', 'to', 'be', 'disregard', 'and', 'the', 'tap', 'notif', 'under', 'the', 'keyboard', 'is', 'open']

Lemmatized Text:
['AppleSupport', 'causing', 'the', 'reply', 'to', 'be', 'disregarded', 'and', 'the', 'tapped', 'notification', 'under', 'the', 'keyboard', 'is', 'opened']
