## What is sentiment analysis? A supervised learning task
Sentiment analysis (or sentiment classification) falls into the broad category of text classification tasks: you are given a phrase, or a list of phrases, and your classifier must tell whether the sentiment behind each one is positive, negative, or neutral. Sometimes the neutral class is dropped to keep it a binary classification problem. More recent tasks also consider finer-grained sentiments such as "somewhat positive" and "somewhat negative".

In [3]:
#regular expressions
import re
#punctuation table used by clean_tweets
import string

import pandas as pd
#sentiment analysis package
#!pip install textblob
from textblob import TextBlob

#general text pre-processor
#!pip install nltk
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')

#tweet pre-processor 
#!pip install tweet-preprocessor
import preprocessor as p

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bessy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
### Cleaning the tweets

# Happy emoticons; a token equal to one of these is dropped from the output.
_EMOTICONS_HAPPY = frozenset([
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3'
])

# Sad emoticons; treated exactly like the happy ones.
_EMOTICONS_SAD = frozenset([
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';('
])

# Combined emoticon lookup, built once instead of once per tweet.
_EMOTICONS = _EMOTICONS_HAPPY | _EMOTICONS_SAD

# Emoji code-point ranges, compiled once instead of once per tweet.
_EMOJI_PATTERN = re.compile("["
         u"\U0001F600-\U0001F64F"  # emoticons
         u"\U0001F300-\U0001F5FF"  # symbols & pictographs
         u"\U0001F680-\U0001F6FF"  # transport & map symbols
         u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
         u"\U00002702-\U000027B0"
         u"\U000024C2-\U0001F251"
         "]+", flags=re.UNICODE)


def clean_tweets(data_file, text_column):
    """Read a CSV of tweets and add a ``cleaned_text`` column.

    Each tweet goes through these steps, in order:

    1. ``p.clean`` from the tweet-preprocessor package.
    2. Regex clean-up: remove colons, remove the mojibake sequence
       ``‚Ä¶``, replace runs of non-ASCII characters with a space, and
       strip emoji.
    3. ``nltk.word_tokenize`` on the cleaned text.
    4. Drop tokens that are English stop words, known emoticons, or
       punctuation, and re-join the rest with single spaces.

    The tokenisation now runs *after* the regex clean-up. Previously the
    tokens were taken from the text before clean-up, so the substitutions
    in step 2 never reached the returned column.

    Parameters
    ----------
    data_file : str or path-like
        Location of the CSV file; passed straight to ``pandas.read_csv``.
    text_column : str
        Name of the column that holds the raw tweet text.

    Returns
    -------
    pandas.DataFrame
        The frame read from ``data_file`` with one extra column,
        ``cleaned_text``. Rows whose text is missing get an empty string.

    Notes
    -----
    ``stopwords.words('english')`` and ``nltk.word_tokenize`` need their
    NLTK data to be installed. The notebook downloads 'punkt'; the
    'stopwords' corpus must be available as well.
    """
    tweets_df = pd.read_csv(data_file)

    # Loading the stop-word list reads from disk, so do it once per call
    # rather than once per tweet.
    stop_words = set(stopwords.words('english'))

    cleaned_text_list = []
    for text in tweets_df[text_column]:
        # An empty CSV cell is read as NaN; there is nothing to clean.
        if pd.isna(text):
            cleaned_text_list.append('')
            continue

        # use pre processor
        tweet = p.clean(text)

        # a colon is left behind after mentions are removed
        tweet = re.sub(r':', '', tweet)
        tweet = re.sub(r'‚Ä¶', '', tweet)

        # replace consecutive non-ASCII characters with a space
        tweet = re.sub(r'[^\x00-\x7F]+', ' ', tweet)

        # Remove emojis. Every emoji is non-ASCII, so after the previous
        # substitution this is only a guard in case the order changes.
        tweet = _EMOJI_PATTERN.sub(r'', tweet)

        # Tokenise the cleaned text so the substitutions above take effect.
        word_tokens = nltk.word_tokenize(tweet)

        # NOTE: `w not in string.punctuation` is a substring test, so only
        # tokens that occur verbatim inside the punctuation string (mostly
        # single characters) are dropped; a token such as '...' is kept.
        filtered_tweet = [
            w for w in word_tokens
            if w not in stop_words
            and w not in _EMOTICONS
            and w not in string.punctuation
        ]
        cleaned_text_list.append(' '.join(filtered_tweet))

    tweets_df['cleaned_text'] = cleaned_text_list

    return tweets_df

In [6]:
import string

# Load the scraped tweets and attach the cleaned-text column.
test = clean_tweets(data_file="food poisoning.csv", text_column="text")

In [7]:
# Preview the first five rows, raw text next to the cleaned text.
test.head(n=5)

Unnamed: 0.1,Unnamed: 0,user,time,location,text,cleaned_text
0,0,JChapman1729,2020-10-01 15:34:45,"Hexham, England",I’m with @whichuk. @DefraGovUK must #SaveFoodS...,Im must Agriculture Bill Cheap food n't put he...
1,1,TerriEClary,2020-10-01 15:33:35,"California, USA",Anyone ever think that China has been poisonin...,Anyone ever think China poisoning us decades O...
2,2,mehulved,2020-10-01 15:32:56,On a cycle somewhere.,@BorkTales Subway is a scam anyway. I got food...,Subway scam anyway I got food poisoning last t...
3,3,shrtypnd,2020-10-01 15:32:11,,"Aku redho kalau dapat food poisoning esok,, ha...",Aku redho kalau dapat food poisoning esok haih...
4,4,Ajk01249750,2020-10-01 15:32:10,,I’m with @whichuk. @DefraGovUK must #SaveFoodS...,Im must Agriculture Bill I n't want hormone injec
