# Settings

## Imports

In [1]:
import sys,tweepy,csv,re, requests, json
import matplotlib.pyplot as plt
from dotenv import dotenv_values
import pandas as pd
import numpy as np
import os.path, time, re


from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

In [2]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import string
from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gustavo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import warnings
warnings.filterwarnings('ignore') # We can suppress the warnings

# Getting Twitter

## Second API

In [4]:

def get_API():
    #token
    config = dotenv_values(".env")
    consumer_key = config["API_KEY"]
    consumer_secret = config["API_KEY_SECRET"]
    access_token = config["ACCESS_TOKEN"]
    access_token_secret = config["ACCESS_TOKEN_SECRET"]

    auth = tweepy.OAuth1UserHandler(
      consumer_key, 
      consumer_secret, 
      access_token, 
      access_token_secret
    )

    api = tweepy.API(auth) 
    
    print('API started...')
    
    return api



def get_tweets(query, n , p, file_name):
    
    tweets_dt = pd.DataFrame()
    file_name = f'{file_name}.bz2'
 
    if os.path.exists(file_name) == True: #First checking if database exists
        create_dt = time.strftime("%d/%m/%Y %H:%M:%S",time.strptime(time.ctime(os.path.getmtime(file_name))))
        print(f'Reading {file_name}, created at {create_dt}')
        tweets_dt = pd.read_csv(file_name)
        print(f'DataFrame with {len(tweets_dt)} Tweets')
        
    else:
        print(f'Getting tweets...')
        
        api = get_API()

        extracted_pages = []

        for page in tweepy.Cursor(api.search_tweets, 
                                    q=query, 
                                    lang="en",
                                    count=50).pages(5):
            extracted_pages.append(page)


        extracted_tweets_from_pages = []

        for page in extracted_pages:
            extracted_tweets_from_pages += page

        for i in range(len(extracted_tweets_from_pages)):
            tweet = pd.DataFrame.from_dict(extracted_tweets_from_pages[i]._json, orient="index").T.reset_index(drop = True)
            if i == 0:
                tweets_dt = tweet.copy()
            else:
                tweets_dt = pd.concat([tweets_dt, tweet])
            tweets_dt.reset_index(drop = True)
        
        if len(extracted_tweets_from_pages) > 0:
            tweets_dt.to_csv(file_name, index=False,compression='bz2')
            print(f'{len(extracted_tweets_from_pages)} Tweets found and saved')
        else:
            print(f'No Tweets found')
    
    return tweets_dt


In [126]:
q = "Agriculture  -RT"
tweets_dt = get_tweets(q, 10, 1, 'tweets_250_V6').reset_index(drop=True)

Reading tweets_250_V6.bz2, created at 23/12/2022 12:20:15
DataFrame with 250 Tweets


In [127]:
tweets_dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 29 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   created_at                 250 non-null    object 
 1   id                         250 non-null    int64  
 2   id_str                     250 non-null    int64  
 3   text                       250 non-null    object 
 4   truncated                  250 non-null    bool   
 5   entities                   250 non-null    object 
 6   extended_entities          4 non-null      object 
 7   metadata                   250 non-null    object 
 8   source                     250 non-null    object 
 9   in_reply_to_status_id      102 non-null    float64
 10  in_reply_to_status_id_str  102 non-null    float64
 11  in_reply_to_user_id        105 non-null    float64
 12  in_reply_to_user_id_str    105 non-null    float64
 13  in_reply_to_screen_name    105 non-null    object 

In [64]:
tweets_dt.text[0]

'I wish all Agriculture, Food &amp; Drink, Tourism and Domestic Workers across Europe a peaceful and joyful holidays. https://t.co/ZWPOEP9sHy'

In [65]:
tweets = tweets_dt.iloc[:,[0,1,3]]
tweets.head(4)

Unnamed: 0,created_at,id,text
0,Fri Dec 23 12:08:41 +0000 2022,1606260504682409985,"I wish all Agriculture, Food &amp; Drink, Tour..."
1,Fri Dec 23 12:02:35 +0000 2022,1606258966253621248,"I wish all Agriculture, Food &amp; Drink, Tour..."
2,Fri Dec 23 12:00:55 +0000 2022,1606258550464024576,Knowledge of how to make and use pottery was s...
3,Fri Dec 23 12:00:12 +0000 2022,1606258367063764993,"I wish all Agriculture, Food &amp; Drink, Tour..."


## First API

In [192]:
#token
config = dotenv_values(".env")
bearer_token = config['BEARER_TOKEN']

#connections
auth = tweepy.OAuth2BearerHandler({bearer_token})
api = tweepy.API(auth)



def bearer_oauth(r):
    r.headers["Authorization"] = f"Bearer {bearer_token}"
    r.headers["User-Agent"] = "v2RecentSearchPython"
    return r

def connect_to_endpoint(url, params):
    response = requests.get(url, auth=bearer_oauth, params=params)
    #print(response.status_code)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

def get_tweet_v1(query, filename, max_n):
    search_url = 'https://api.twitter.com/2/tweets/search/recent'


    file_name = f'{filename}.bz2'
    
    if os.path.exists(file_name) == False: #First checking if database exists
        print(f'Getting tweets...')
        
        # Querying the API
        json_response = connect_to_endpoint(search_url, query)
        
        tweets_dt = pd.DataFrame.from_dict(json_response['data'])
        
        
        n_token = json_response['meta']["next_token"]
        print(f'First Next Token: {n_token}')
        n = 0
        while n_token != 0 | n < max_n:
            print(f'Next Token: {n} \n {n_token}')
            query_next = query
            query_next['next_token'] = n_token
            json_response = connect_to_endpoint(search_url, query_next)
            tweets_n = pd.DataFrame.from_dict(json_response['data'])
            tweets_dt = pd.concat([tweets_dt,tweets_n], ignore_index=True)

            n += 1
            n_token = json_response['meta']["next_token"]

      
            meta = json_response['meta']
            np.save(f'{filename}.npy', meta)
            print ('file Meta Saved')
 
        tweets_dt.to_csv(file_name, index=False,compression='bz2')
        print(f'{len(tweets_dt)} Tweets found and saved')
        
    else:
        create_dt = time.strftime("%d/%m/%Y %H:%M:%S",time.strptime(time.ctime(os.path.getmtime(file_name))))
        print(f'Reading {file_name}, created at {create_dt}')
        tweets_dt = pd.read_csv(file_name)
        print(f'File with {len(tweets_dt)} Tweets')
        
    return tweets_dt


In [191]:
q = {
    #'q' : 'beef OR milk place:ea679934779f45c7',
    'query': 'agriculture Europe -is:retweet',
    'max_results': 100,
    'expansions': 'author_id,in_reply_to_user_id,geo.place_id',
    'tweet.fields': 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source',
    'user.fields': 'id,name,username,created_at,description,public_metrics,verified',
    'place.fields': 'full_name,id,country,country_code,geo,name,place_type',
}

tweet_eu = get_tweet_v1(q, 'tweets_eu', 10)

Getting tweets...
First Next Token: b26v89c19zqg8o3fqk11ohhcjw0yomons2rx65pu7hxfh
Next Token: 0 
 b26v89c19zqg8o3fqk11ohhcjw0yomons2rx65pu7hxfh
file Meta Saved
Next Token: 1 
 b26v89c19zqg8o3fqk11ob1yv4r8vftq8oepugkdzc399
file Meta Saved
30 Tweets found and saved


In [193]:
tweet_v1

Unnamed: 0,public_metrics,created_at,edit_history_tweet_ids,reply_settings,author_id,conversation_id,id,lang,text,in_reply_to_user_id,referenced_tweets,geo
0,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",2022-12-23T13:55:19.000Z,[1606287339801681920],everyone,63684604,1606287339801681920,1606287339801681920,en,Assistant Secretary (Head of Seafood and Marin...,,,
1,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",2022-12-23T06:24:28.000Z,[1606173877155123201],everyone,322654210,1606173877155123201,1606173877155123201,en,"@JohnWWarnerIV: 'Aran Island, Ireland. Reminds...",1.105157518941143e+18,,
2,"{'retweet_count': 1, 'reply_count': 2, 'like_c...",2022-12-23T01:31:18.000Z,[1606100100320792578],everyone,1105157518941143047,1606100100320792578,1606100100320792578,en,"Aran Island, Ireland. Reminds me of the millio...",,,
3,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",2022-12-23T00:06:29.000Z,[1606078754580844544],everyone,231244943,1605860605138796544,1606078754580844544,en,@agriculture_ie @CorkGreens So try turn Irelan...,2589328316.0,"[{'type': 'replied_to', 'id': '160586060513879...",
4,"{'retweet_count': 1, 'reply_count': 0, 'like_c...",2022-12-22T06:00:42.000Z,[1605805508597776386],everyone,316327666,1605805508597776386,1605805508597776386,en,Any business involved in the supply chain of a...,,,
5,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",2022-12-21T22:05:36.000Z,[1605685947382669312],everyone,913076671640633344,1605594637892214785,1605685947382669312,en,@christinafinn8 This thinking didn't end well ...,174844847.0,"[{'type': 'replied_to', 'id': '160559463789221...",
6,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",2022-12-21T21:35:55.000Z,[1605678477235179521],everyone,116275394,1605528031132631040,1605678477235179521,en,@agriculture_ie @McConalogue @DarraghOBrienTD ...,2589328316.0,"[{'type': 'replied_to', 'id': '160552803113263...",
7,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",2022-12-21T18:30:00.000Z,[1605631688700153857],everyone,162659729,1605631688700153857,1605631688700153857,en,NEWS: The Agriculture Minister has welcomed th...,,,
8,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",2022-12-21T18:00:01.000Z,[1605624144279765000],everyone,923680495405293569,1605624144279765000,1605624144279765000,en,Great to see collaboration between Ireland and...,,,
9,"{'retweet_count': 0, 'reply_count': 1, 'like_c...",2022-12-21T12:49:29.000Z,[1605545993885192192],everyone,295365474,1604819284626010113,1605545993885192192,en,@FionaHo07708103 @ethicalfarmIE @MyLovelyHorse...,1.4682147733052498e+18,"[{'type': 'replied_to', 'id': '160554311218862...",{'place_id': '583d04d90e3931f3'}


In [168]:
# Load Meta
meta = np.load('tweets_v4.npy',allow_pickle='TRUE').item()
meta#['next_token']

{'newest_id': '1606287339801681920',
 'oldest_id': '1603799682030911488',
 'result_count': 53}

# Clean Your Text Data

## Worlds extractions

In [66]:
def clean_tweet(x, link, keyword, usernames):
    list_of_lists =[]
    if link == True:
        list_of_links = []
        words = x.split(' ')
        for word in words:
            if re.search('http', word):
                list_of_links.append(re.split("\W+",word.lower()))
        if len(list_of_links) > 0:
            list_of_lists.append(list_of_links[0])
    
    if keyword == True:
        list_of_keywords = []
        words = x.split()
        for word in words:
            if word.startswith('#'):
                list_of_keywords.append(word)
        if len(list_of_keywords) > 0:
            list_of_lists.append(list_of_keywords)
            
    if usernames == True:
        list_of_usernames = []
        words = x.split()
        for word in words:
            if word.startswith('@'):
                list_of_usernames.append(word.lower().replace('@',''))
        if len(list_of_usernames) > 0:
            list_of_lists.append(list_of_usernames)
    
    return  [item for sublist in list_of_lists for item in sublist]

        
# keyword extraction from tweets
def get_keywords(x):
    list_of_keywords = []
    words = x.split()
    for word in words:
        if word.startswith('#'):
            list_of_keywords.append(word)
    return list_of_keywords


In [67]:
list_of_lists = tweets['text'].apply(lambda tweet : clean_tweet(tweet, link = True, keyword = False, usernames = True))
rem_list = [item for sublist in list_of_lists for item in sublist]

tweets['text_c'] = tweets['text'].apply( lambda tweet : ' '.join([word for word in re.split("\W+",tweet) if word.lower() not in rem_list]))

In [68]:
tweets['keywords'] = tweets['text'].apply( lambda tweet : get_keywords(tweet) )

In [103]:
tweets.sample(2)

Unnamed: 0,created_at,id,text,text_c,keywords,text_ps,TextBlob,Vader,TextBlob_desc,Vader_desc,is_sarcastic
126,Sun Dec 18 01:37:16 +0000 2022,1604289660687204352,@FoodProfessor Tired of people who have no con...,Tired of people who have no connection to agri...,[],tire peopl connect agricultur answer farmer st...,0.0,0.0,neutral,neutral,0
225,Wed Dec 14 20:22:18 +0000 2022,1603123234790027266,@AgrilandIreland #AllForShow given GDPR and 20...,AllForShow given GDPR and 2016 1012 hasn even ...,[#AllForShow],allforshow given gdpr even appli yet member st...,0.0,0.0,neutral,neutral,0


In [70]:
n = 1
print('OLD: ', tweets['text'][n], '\n')
print('NEW: ', tweets['text_c'][n])

OLD:  I wish all Agriculture, Food &amp; Drink, Tourism and Domestic Workers across Europe a peaceful and joyful holidays. https://t.co/jnxr2I4J4b 

NEW:  I wish all Agriculture Food amp Drink Tourism and Domestic Workers across Europe a peaceful and joyful holidays


## PoterStemmer

In [71]:
# Store the stopwords into the object named as "stop_words"
stop_words = stopwords.words('english')

# Store the string.punctuation into an object punct
punct = string.punctuation

# Initialise an object using a method PorterStemmer
stemmer = PorterStemmer()

In [72]:
def stremming(df, text_col, name_new_col):
    # Store the column of the dataframe named as "text"
    X = df[text_col]
    cleaned_data=[]
    # For loop from first value to length(X), ^a-zA-Z means include small and capital case letters
    for i in range(len(X)):
        text = re.sub('[^a-zA-Z]', ' ', X.iloc[i])
        text = text.lower().split()
        text = [stemmer.stem(word) for word in text if (word not in stop_words) and (word not in punct)]
        text = ' '.join(text)
        df.loc[ i ,name_new_col] = text
    print('Stremmer done!')

In [73]:
stremming(tweets, 'text_c', 'text_ps')

Stremmer done!


In [74]:
print(tweets['text'][0], '\n')
print(tweets['text_ps'][0], '\n')

I wish all Agriculture, Food &amp; Drink, Tourism and Domestic Workers across Europe a peaceful and joyful holidays. https://t.co/ZWPOEP9sHy 

wish agricultur food amp drink tourism domest worker across europ peac joy holiday 



# Sentiment Analyzes

TextBlob is a Python (2 and 3) library for processing textual data. It provides a simple API for diving into common natural language processing (NLP) tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, translation, and more. [link](https://textblob.readthedocs.io/en/dev/index.html)

In [75]:
for i in tweets.index:
    text = tweets.loc[i,'text_ps']
    tweets.loc[i, 'TextBlob'] = TextBlob(text).sentiment.polarity
    #print(TextBlob(text).sentiment.polarity)
    tweets.loc[i, 'Vader'] = SentimentIntensityAnalyzer().polarity_scores(text)['compound']
    #print(SentimentIntensityAnalyzer().polarity_scores(text)['compound'], '\n')
    
tweets.sample(2)

Unnamed: 0,created_at,id,text,text_c,keywords,text_ps,TextBlob,Vader
109,Mon Dec 19 06:40:07 +0000 2022,1604728265100386305,@owled_ @TwoProngedSword @restoreorderusa euro...,europe wouldn have advanced out of medieval ti...,[],europ advanc mediev time,0.0,0.0
86,Tue Dec 20 01:26:45 +0000 2022,1605011792341647360,@ThePeoplesHubUK @GoogleExpertUK @TheRemarkabl...,TheTechieGirls,[],thetechiegirl,0.0,0.0


In [76]:
def get_polarity(df, col):

    polarity = 0
    neutral = 0
    wpositive = 0
    positive = 0
    spositive = 0
    wnegative = 0
    negative = 0
    snegative = 0
     
    for t in df.index:
        
        v = df.loc[t, col]
        polarity += v  # adding up polarities to find the average later

        if (v == 0):  # adding reaction of how people are reacting to find average later
            neutral += 1
            desc = 'neutral'
        elif (v > 0 and v <= 0.3):
            wpositive += 1
            desc ='weak_positive'
        elif (v > 0.3 and v <= 0.6):
            positive += 1
            desc = 'positive'
        elif (v > 0.6 and v <= 1):
            spositive += 1
            desc = 'strong_positive'
        elif (v > -0.3 and v <= 0):
            wnegative += 1
            desc = 'weak_negative'
        elif (v > -0.6 and v <= -0.3):
            negative += 1
            desc = 'negative'
        elif (v > -1 and v <= -0.6):
            snegative += 1
            desc = 'strong_negative'
         
        df.loc[t, f'{col}_desc'] = desc
        

    return {'polarity_sum':polarity,
            'polarity_mean':(polarity / len(df)),
            'neutral':neutral,
            'strong_positive':spositive,
            'positive':positive,
            'weak_positive':wpositive,
            'weak_negative':wnegative,
            'negative':negative,
            'strong_negative':snegative}




In [77]:
get_polarity(tweets,'TextBlob')

{'polarity_sum': 17.242997835497835,
 'polarity_mean': 0.06897199134199133,
 'neutral': 158,
 'strong_positive': 12,
 'positive': 23,
 'weak_positive': 29,
 'weak_negative': 24,
 'negative': 3,
 'strong_negative': 1}

In [78]:
get_polarity(tweets,'Vader')

{'polarity_sum': 23.771100000000004,
 'polarity_mean': 0.09508440000000001,
 'neutral': 130,
 'strong_positive': 20,
 'positive': 48,
 'weak_positive': 19,
 'weak_negative': 12,
 'negative': 14,
 'strong_negative': 7}

In [80]:
tweets.sample()

Unnamed: 0,created_at,id,text,text_c,keywords,text_ps,TextBlob,Vader,TextBlob_desc,Vader_desc
27,Thu Dec 22 13:05:48 +0000 2022,1605912489669050371,"‘Enjoy, it’s from Europe’: Brussels announces ...",Enjoy it s from Europe Brussels announces fres...,[],enjoy europ brussel announc fresh agri food pr...,0.35,0.6705,positive,strong_positive


In [81]:
##Checking Twitters

n = 3
print('Text Original:', tweets.loc[n, 'text'], '\n',
     '-------------------------------------------------------------------------------------------')
print('Text Clear:', tweets.loc[n, 'text_c'], '\n',
     '-------------------------------------------------------------------------------------------')

print('Text Steammed:', tweets.loc[n, 'text_ps'], '\n',
     '-------------------------------------------------------------------------------------------')

print('KeyWords:', tweets.loc[n, 'keywords'], '\n',
     '-------------------------------------------------------------------------------------------')

print('TextBlob: ',tweets.loc[n, 'TextBlob'], tweets.loc[n, 'TextBlob_desc'])
print('Vader: ', tweets.loc[n, 'Vader'], tweets.loc[n, 'Vader_desc'])

Text Original: I wish all Agriculture, Food &amp; Drink, Tourism and Domestic Workers across Europe a peaceful and joyful holidays. https://t.co/EHGS8vi35D 
 -------------------------------------------------------------------------------------------
Text Clear: I wish all Agriculture Food amp Drink Tourism and Domestic Workers across Europe a peaceful and joyful holidays 
 -------------------------------------------------------------------------------------------
Text Steammed: wish agricultur food amp drink tourism domest worker across europ peac joy holiday 
 -------------------------------------------------------------------------------------------
KeyWords: [] 
 -------------------------------------------------------------------------------------------
TextBlob:  0.8 strong_positive
Vader:  0.8481 strong_positive


In [82]:
tweets.iloc[:, -6:]

Unnamed: 0,keywords,text_ps,TextBlob,Vader,TextBlob_desc,Vader_desc
0,[],wish agricultur food amp drink tourism domest ...,0.800000,0.8481,strong_positive,strong_positive
1,[],wish agricultur food amp drink tourism domest ...,0.800000,0.8481,strong_positive,strong_positive
2,[],knowledg make use potteri share hunter gather ...,0.000000,0.2960,neutral,weak_positive
3,[],wish agricultur food amp drink tourism domest ...,0.800000,0.8481,strong_positive,strong_positive
4,[],wish agricultur food amp drink tourism domest ...,0.800000,0.8481,strong_positive,strong_positive
...,...,...,...,...,...,...
245,[],hold third largest economi europ boast econom ...,0.000000,0.3818,neutral,positive
246,"[#France, #FAO, #Europe]",associ project offic food agricultur europ,0.000000,0.0000,neutral,neutral
247,[#TheIrishCivilWar],ireland time minist agricultur taoisigh fact t...,0.000000,0.0000,neutral,neutral
248,[],quit sure variou tribe quit sophist pygmi live...,0.318182,0.3182,positive,positive


## Sarcasm

In [83]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [84]:
news = pd.read_csv('sarcasm_headlines.bz2')
news.sample()

Unnamed: 0,headline,is_sarcastic
12895,funniest parenting tweets: what moms and dads ...,0


In [85]:
print(news.isnull().any(axis = 0))

headline        False
is_sarcastic    False
dtype: bool


In [86]:
stremming(news, 'headline', 'headline_ps')

Stremmer done!


In [87]:
news.sample(2)

Unnamed: 0,headline,is_sarcastic,headline_ps
1769,gunfire erupts in ferguson after protester is ...,0,gunfir erupt ferguson protest struck car
24840,these are the only 5 shoes you need in your cl...,0,shoe need closet fall


In [88]:
def create_features(df, col, max_feat):
    features = df[col]

    # vectorizing the data with maximum features
    tv = TfidfVectorizer(max_features = max_feat)
    features = list(features)
    features = tv.fit_transform(features).toarray()
    
    return features

In [92]:
features = create_features(news, 'headline_ps', 1000)
labels = news['is_sarcastic']

In [93]:
# getting training and testing data
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = .05, random_state = 0)

In [94]:
print('\nLinear Support Vector Classifier:')
lsvc = LinearSVC()
lsvc.fit(features_train, labels_train)
print('Train: ',lsvc.score(features_train, labels_train))
print('Test: ',lsvc.score(features_test, labels_test))


print('\nGaussuan Naive Bayes:')
gnb = GaussianNB()
gnb.fit(features_train, labels_train)
print('Train: ',gnb.score(features_train, labels_train))
print('Test: ',gnb.score(features_test, labels_test))


print('\nLogistic Regression:')
lr = LogisticRegression()
lr.fit(features_train, labels_train)
print('Train: ',lr.score(features_train, labels_train))
print('Test: ',lr.score(features_test, labels_test))


print('\nRandom Forest Classifier:')
rfc = RandomForestClassifier(n_estimators = 10, random_state = 0)
rfc.fit(features_train, labels_train)
print('Train: ',rfc.score(features_train, labels_train))
print('Test: ',rfc.score(features_test, labels_test))


Linear Support Vector Classifier:
Train:  0.7708587868994601
Test:  0.7455089820359282

Gaussuan Naive Bayes:
Train:  0.7110314113427659
Test:  0.6931137724550899

Logistic Regression:
Train:  0.7681787727111496
Test:  0.7455089820359282

Random Forest Classifier:
Train:  0.9592874315216963
Test:  0.7163173652694611


### Prev

In [97]:
prev = create_features(tweets, 'text_ps', 1000)

In [98]:
tweets['is_sarcastic'] = lsvc.predict(prev)

In [99]:
tweets.sample(4)

Unnamed: 0,created_at,id,text,text_c,keywords,text_ps,TextBlob,Vader,TextBlob_desc,Vader_desc,is_sarcastic
18,Thu Dec 22 17:30:01 +0000 2022,1605978981928419329,"For decades, researchers believed pottery arri...",For decades researchers believed pottery arriv...,[],decad research believ potteri arriv europ agri...,0.0,0.0,neutral,neutral,1
64,Tue Dec 20 17:53:59 +0000 2022,1605260237379870730,Excellent discussion on the future of #Abraham...,Excellent discussion on future of AbrahamAccor...,[#AbrahamAccords],excel discuss futur abrahamaccord cooper agric...,0.0,0.4588,neutral,positive,1
174,Thu Dec 15 21:32:33 +0000 2022,1603503302917890064,A recent studie on climate and trees suggest H...,A recent studie on climate and trees suggest H...,[],recent studi climat tree suggest hunnic peopl ...,0.0,0.0,neutral,neutral,0
124,Sun Dec 18 09:09:52 +0000 2022,1604403562515435520,‘Sensible centrism’ = racist eugenics.\n\nThe ...,Sensible centrism racist eugenics cheddar man ...,[],sensibl centrism racist eugen cheddar man sout...,0.0,-0.6124,neutral,strong_negative,0


In [102]:
##Checking Twitters

n = 78
print('Text Original:', tweets.loc[n, 'text'], '\n',
     '-------------------------------------------------------------------------------------------')
print('Text Clear:', tweets.loc[n, 'text_c'], '\n',
     '-------------------------------------------------------------------------------------------')

print('Text Steammed:', tweets.loc[n, 'text_ps'], '\n',
     '-------------------------------------------------------------------------------------------')

print('KeyWords:', tweets.loc[n, 'keywords'], '\n',
     '-------------------------------------------------------------------------------------------')

print('TextBlob: ',tweets.loc[n, 'TextBlob'], tweets.loc[n, 'TextBlob_desc'])
print('Vader: ', tweets.loc[n, 'Vader'], tweets.loc[n, 'Vader_desc'])
print('Sarcasm: ', tweets.loc[n, 'is_sarcastic'])


Text Original: The number of sheep slaughtered during the period January to November 2022 increased by 7.5%… https://t.co/o722t07Eo6 
 -------------------------------------------------------------------------------------------
Text Clear: number of sheep slaughtered during period January to November 2022 increased by 7 5 
 -------------------------------------------------------------------------------------------
Text Steammed: number sheep slaughter period januari novemb increas 
 -------------------------------------------------------------------------------------------
KeyWords: [] 
 -------------------------------------------------------------------------------------------
TextBlob:  0.0 neutral
Vader:  0.0772 weak_positive
Sarcasm:  1
