![Screen%20Shot%202021-06-19%20at%2010.06.07%20AM.png](attachment:Screen%20Shot%202021-06-19%20at%2010.06.07%20AM.png)

# Using Twitter API & and Predicting

In [13]:
# importing necessary libraries and packages
import pandas as pd
import numpy as np

import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn import svm

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import classification_report, accuracy_score, recall_score, f1_score, confusion_matrix, precision_recall_curve, plot_confusion_matrix, auc

import pickle

import twint

import nest_asyncio

import requests
import oauthlib

from twython import Twython

In [23]:
# regular expression to identify non-ascii characters in content
non_ascii_regex = r'[^\x00-\x7F]+'

# defining stop words
stopword_list = stopwords.words('english')
stopword_list += list(string.punctuation)

# function to remove special characters, tokenize, and stem content
def process(content):
    
    # using library re to replace non ascii characters by a space
    text = re.sub(non_ascii_regex, ' ', content)

    # instantiating TweetTokenizer
    tk = TweetTokenizer(strip_handles=True)
    # tokenizing the content & removing usernames
    tokens = tk.tokenize(content)
    
    # instantiating stemmer
    ps = PorterStemmer()

    # stemming the tokens and removing the stopwords
    clean_tokens = []
    for token in tokens:
        if token not in stopword_list:
            try:
                clean_tokens.append(ps.stem(token.lower()))
            except:
                pass
        
    # return the tokens
    return clean_tokens

In [24]:
# Loading pretrained logistic regression model
with open('pickles/bestlr_pipeline.pkl', 'rb') as file:
    model = pickle.load(file)

In [25]:
with open('pickles/tree_pipeline.pkl', 'rb') as file:
    tree = pickle.load(file)

In [7]:
model.predict(['suck', 'hate', 'moron', 'happy'])

array([1., 1., 1., 0.])

In [8]:
model.predict(['fat', 'skinny', 'love', 'balls'])

array([1., 0., 0., 1.])

In [9]:
model.predict(['i hate you'])

array([1.])

In [55]:
model.predict(['i love you'])

array([0.])

## Getting New Data from Twitter's API

I utilized Twython and my developer access to Twitter's API in order to serach for some new data to use to test out the model!

In [8]:
# setting up credentials
APP_KEY = [key_hidden_for_security]
APP_SECRET = key_hidden_for_security]

twitter = Twython(APP_KEY, APP_SECRET)

auth = twitter.get_authentication_tokens()

In [9]:
OAUTH_TOKEN = auth['oauth_token']
OAUTH_TOKEN_SECRET = auth['oauth_token_secret']

In [10]:
auth['auth_url']

'https://api.twitter.com/oauth/authenticate?oauth_token=IT7khgAAAAABPrh4AAABeiDhlRo'

In [14]:
OAUTH_TOKEN = [key_hidden_for_security]
OAUTH_TOKEN_SECRET = [key_hidden_for_security]

In [15]:
twitter = Twython(APP_KEY, APP_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET)

In [40]:
# creating Twython instance
twitter = Twython(APP_KEY, APP_SECRET, OAUTH_TOKEN,
    OAUTH_TOKEN_SECRET)

# searching for tweets containing 'hate'
results = twitter.cursor(twitter.search, q='hate')
hate_tweets = []
# appending search results to list
for result in results:
    hate_tweets.append(result)

TwythonRateLimitError: Twitter API returned a 429 (Too Many Requests), Rate limit exceeded

In [114]:
# see how many tweets we got before rate limit exceeded
len(hate_tweets)

2690

In [115]:
def predict(text):
    prediction = model.predict(text)
    if prediction == 0:
        print("not cyberbullying")
    if prediction == 1:
        print("cyberbullying")

In [116]:
test1 = [hate_tweets[90]['text']]

In [117]:
# checking out a tweet
test1

['@notbitterbetter I’m not. I fuckin hate them 😂']

In [118]:
# using the model to make a prediction
predict(test1)

cyberbullying


The model classifies this as cyberbullying, due to certain common cyberbullying words contained in the tweet.

In [60]:
test2 = process(hate_tweets[1]['text'])

In [119]:
test2 = [hate_tweets[1]['text']]
test2

["RT @CutieStraw: If you have a raging hate boner for someone that is so intense that you go so far as to mock their newborn child's name, CO…"]

In [120]:
predict(test2)

cyberbullying


Again, the model classifies this as cyberbullying, due to the words contained in the tweet.

In [121]:
test3 = [hate_tweets[11]['text']]
test3

['not to be dramatic or anything but my birthday is tomorrow and if you don’t tell me happy birthday i’ll literally hate you forever']

In [122]:
predict(test3)

not cyberbullying


The model classifies this tweet as non-cyberbullying.

In [71]:
twitter = Twython(APP_KEY, APP_SECRET, OAUTH_TOKEN,
    OAUTH_TOKEN_SECRET)

# searching for tweets containing 'love'
results = twitter.cursor(twitter.search, q='love')
love_tweets = []
# appending search results to list
for result in results:
    love_tweets.append(result)

TwythonRateLimitError: Twitter API returned a 429 (Too Many Requests), Rate limit exceeded

In [123]:
# see how many tweets we got before rate limit exceeded
len(love_tweets)

2681

In [124]:
# checking out a tweet
test4 = [love_tweets[11]['text']]
test4

['I believe this is a faithful project.The projector has a lot of attractions so hopefully the project will be better… https://t.co/QcBIJC9SlC']

In [125]:
# using the model to make a prediction
predict(test4)

not cyberbullying


In [126]:
test5 = [love_tweets[40]['text']]
test5

['@vornietom love this']

In [127]:
predict(test5)

not cyberbullying


In [128]:
test6 = [love_tweets[70]['text']]
test6

['@WorldsBedford @BattleBeaverC Share some of that love 😍']

In [129]:
predict(test6)

not cyberbullying


The model classified the love_tweets as non-cyberbullying!

In [130]:
# further model demonstration
nice = ['you are so beautiful']
predict(nice)

not cyberbullying


In [131]:
mean = ['you are so ugly']
predict(mean)

cyberbullying


In [157]:
user_input = ['you are the best']

In [158]:
predict(user_input)

not cyberbullying
