In [None]:
# standard library
import csv
import re
import string

# data handling / numerics
import numpy as np
import pandas as pd

# text processing
import nltk
from nltk.corpus import stopwords

# modelling (scikit-learn)
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

# viz libs
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Path of the input CSV (placeholder value kept exactly as in the notebook).
csv_path = "....csv"
df = pd.read_csv(csv_path)


In [None]:
# Preview the first rows only instead of rendering the whole frame.
df.head()

In [None]:
import nltk #data pre-processing
from nltk.corpus import stopwords

# Fetch the stopword corpus only when it is missing, so the notebook also runs
# on a fresh environment without re-downloading on every execution.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")
# NOTE(review): nltk.word_tokenize (used by preprocess_text) presumably needs the
# punkt tokenizer data as well — confirm it is installed.

stopwords.words("english")[:500] # display (up to 500 of) the English stopwords

In [None]:
# Extra, corpus-specific words to treat as stopwords on top of NLTK's English list.
new_words = ('yeah', 'okay')

# stopwords.words('english') returns a new list on every call, so appending to its
# return value is silently discarded. Keep the extended list in a variable so the
# extra words are actually retained and can be used when filtering tokens.
custom_stopwords = stopwords.words('english') + list(new_words)



In [None]:
def preprocess_text(text: str, remove_stopwords: bool, extra_stopwords=()) -> str:
    """Sanitize a string before vectorization.

    Steps, in order:
    - remove links (any run of non-whitespace characters starting with "http")
    - replace every run of non-letter characters (digits, punctuation,
      whitespace) with a single space
    - optionally remove English stopwords
    - transform to lowercase and strip surrounding whitespace

    Args:
        text (str): the input text you want to clean. Non-string values
            (e.g. None, or the float NaN pandas uses for missing cells)
            are treated as empty text.
        remove_stopwords (bool): whether or not to remove stopwords
        extra_stopwords (iterable of str): additional words to drop together
            with NLTK's English list (compared in lowercase). Only used when
            remove_stopwords is true. Defaults to no extra words.
    Returns:
        str: the cleaned text ("" for non-string input)
    """
    # Missing values would otherwise make re.sub raise a TypeError.
    if not isinstance(text, str):
        return ""

    # remove links
    text = re.sub(r"http\S+", "", text)
    # remove special chars and numbers
    text = re.sub("[^A-Za-z]+", " ", text)
    if remove_stopwords:
        # Build the lookup set once per call; the previous version re-created
        # the full stopword list for every single token.
        stop_set = set(stopwords.words("english"))
        stop_set.update(word.lower() for word in extra_stopwords)
        tokens = nltk.word_tokenize(text)
        text = " ".join(tok for tok in tokens if tok.lower() not in stop_set)
    # return text in lower case and stripped of whitespaces
    return text.lower().strip()

In [None]:
# Preview what stripping non-letter characters does to the first few bodies.
# The earlier loop applied the regex to str() of the whole column on every
# iteration and threw the result away; this applies it per document and shows it.
# df is not modified here — the actual cleaning is done by preprocess_text.
df['body'].head().map(lambda doc: re.sub("[^a-zA-Z]", " ", str(doc)))


In [None]:
# dropna() returns a new frame; without the assignment the rows with missing
# values stayed in df and reached the text-cleaning step.
df = df.dropna()

In [None]:
# The extra stopwords in `new_words` are not part of NLTK's English list that
# preprocess_text filters with, so they are removed here after cleaning.
extra_words = {word.lower() for word in new_words}


def _clean_body(body):
    """Clean one body with preprocess_text, then drop the notebook's extra stopwords."""
    cleaned = preprocess_text(body, remove_stopwords=True)
    return " ".join(tok for tok in cleaned.split() if tok not in extra_words)


df['cleaned'] = df['body'].apply(_clean_body)


In [None]:
# Preview the first rows (now including the 'cleaned' column) instead of the whole frame.
df.head()

In [None]:
# TF-IDF settings: sublinear term-frequency scaling, ignore terms that appear in
# fewer than 5 documents or in more than 95% of them
tfidf_options = {"sublinear_tf": True, "min_df": 5, "max_df": 0.95}
vectorizer = TfidfVectorizer(**tfidf_options)
# learn the vocabulary from the cleaned texts and keep the document-term matrix in X
X = vectorizer.fit_transform(df['cleaned'])

In [None]:
# Densify only the first rows for inspection; converting the whole sparse
# TF-IDF matrix just to display it wastes memory.
X[:5].toarray()

In [None]:
# Elbow method: fit KMeans for k = 1..9 and record each model's inertia
# (within-cluster sum of squared distances) to choose the number of clusters.
Sum_of_squared_distances = []
K = range(1,10)
for k in K:
    # Fixed seed so the curve is the same on every run.
    km = KMeans(n_clusters=k, random_state=42).fit(X)
    Sum_of_squared_distances.append(km.inertia_)
# Plot against K itself so the x-values cannot drift from the fitted range.
plt.plot(K, Sum_of_squared_distances)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Sum of Squared Differences')
plt.show()

In [None]:
from sklearn.cluster import KMeans

# fit KMeans with 4 centroids (seeded) on the TF-IDF matrix
kmeans = KMeans(n_clusters=4, random_state=42).fit(X)
# one cluster label per document
clusters = kmeans.labels_

In [None]:
# first ten cluster labels
list(clusters[:10])

In [None]:
from sklearn.decomposition import PCA

# project the densified TF-IDF matrix onto two principal components (seeded)
pca = PCA(n_components=2, random_state=42)
pca_vecs = pca.fit_transform(X.toarray())
# first component goes to x0, second to x1
x0, x1 = pca_vecs[:, 0], pca_vecs[:, 1]

In [None]:
# Preview the first values of the first PCA component.
x0[:10]

In [None]:
# Preview the first values of the second PCA component.
x1[:10]

In [None]:
# attach the cluster label and the two PCA coordinates to every row
for column, values in {'cluster': clusters, 'x0': x0, 'x1': x1}.items():
    df[column] = values

In [None]:
def get_top_keywords(n_terms):
    """Print the highest-scoring TF-IDF terms of every cluster.

    Averages the rows of the TF-IDF matrix ``X`` per label in ``clusters`` and,
    for each cluster, prints the ``n_terms`` terms with the largest mean score
    as a comma-separated line in ascending order of score (strongest last).

    Args:
        n_terms (int): number of terms to print per cluster; 0 prints none.
    """
    # groups the TF-IDF vectors by cluster and averages them
    cluster_means = pd.DataFrame(X.todense()).groupby(clusters).mean()
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on older versions.
    if hasattr(vectorizer, "get_feature_names_out"):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()
    for i, r in cluster_means.iterrows():
        print('\nCluster {}'.format(i))
        order = np.argsort(r.to_numpy())
        # Slice from an explicit start index: a plain [-n_terms:] would return
        # every term when n_terms is 0.
        top_positions = order[max(len(order) - n_terms, 0):]
        print(','.join([terms[t] for t in top_positions]))
            

In [None]:
# list the 100 strongest terms of each cluster
get_top_keywords(n_terms=100)

In [None]:

# one figure with a single axes for the 2-D projection
fig, ax = plt.subplots(figsize=(12, 7))
axis_font = {"fontsize": 16}
# title and axis names are set before plotting
ax.set_title("Insert Title Here", fontdict={"fontsize": 18})
ax.set_xlabel("X0", fontdict=axis_font)
ax.set_ylabel("X1", fontdict=axis_font)
# scatter of the two PCA coordinates, coloured by cluster
sns.scatterplot(data=df, x='x0', y='x1', hue='cluster', palette="Set2", ax=ax)
plt.show()

In [None]:
# Human-readable names for the numeric KMeans labels.
# NOTE(review): KMeans is fit with n_clusters=4, so only keys 0-3 can occur;
# keys 4-7 look like leftovers from a run with more clusters — confirm.
cluster_map = {0: "Sarcasm/Humor", 1: "Dry January Resources", 2: "Perrier Ad", 3: "Dry January Support", 4: "Unclear/General", 5: "Perrier Ad", 6: "Dry January Health Benefits", 7: "Encouragement"}
# Map from the raw KMeans labels rather than from df['cluster'] itself, so that
# re-running this cell yields the same names instead of turning the
# already-named values into NaN.
df['cluster'] = pd.Series(clusters, index=df.index).map(cluster_map)

In [None]:
# distinct cluster names present after the mapping
pd.unique(df['cluster'])

In [None]:
# One-hot encode the cluster names while KEEPING the original 'cluster' column:
# pd.get_dummies(df, columns=['cluster']) removes it, but the cross-validation
# cells further down still read df['cluster'] / df.cluster.
cluster_dummies = pd.get_dummies(df['cluster'], prefix='cluster')
# Dropping dummy columns left by a previous run keeps the cell safe to re-run.
df = pd.concat(
    [df.drop(columns=cluster_dummies.columns, errors='ignore'), cluster_dummies],
    axis=1,
)


In [None]:
# Preview the first rows (with the one-hot cluster columns) instead of the whole frame.
df.head()

In [None]:
####################VADER########################

In [None]:
import nltk
import os
import pandas as pd
import numpy as np
import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# VADER analyzer instance; its polarity_scores() is applied to the text column below.
sid = SentimentIntensityAnalyzer()

In [None]:
# score every text with VADER; each cell holds the dict returned by polarity_scores
# NOTE(review): assumes the loaded CSV has a 'text' column — it is not created in this notebook
df['scores'] = df['text'].apply(sid.polarity_scores)

In [None]:
# pull the 'compound' entry out of each score dict into its own column
df['compound'] = df['scores'].map(lambda score_dict: score_dict['compound'])

In [None]:
# Preview the first rows (with the sentiment columns) instead of the whole frame.
df.head()

In [None]:
# Destination of the enriched table (placeholder path kept exactly as in the notebook).
output_path = ".....csv"
df.to_csv(output_path)

In [None]:
#########################Botometer############################

In [None]:
#########################Cross-Validation######################

In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

In [None]:
# keep only the text and its cluster label for the classification check
# NOTE(review): this rebinds df to a two-column frame, while later cells read
# df['4'] and df['username'] — presumably df is reloaded in between; confirm.
cv_columns = ['body', 'cluster']
df = df.loc[:, cv_columns]

In [None]:
# number of rows per cluster label
df['cluster'].value_counts()

In [None]:
# bag-of-words settings: 1- to 4-grams, English stop words removed,
# accents stripped, at most 1000 features
countvec_options = dict(
    ngram_range=(1, 4),
    stop_words='english',
    strip_accents='unicode',
    max_features=1000,
)
countvec = CountVectorizer(**countvec_options)

In [None]:
# Bag-of-words features from the text column. df only holds 'body' and 'cluster'
# at this point, so the text must come from 'body'; `df.tweet` does not exist.
bow = countvec.fit_transform(df['body'])
mnb = MultinomialNB()
# 5-fold cross-validated accuracy of predicting the cluster label from the text.
# The sparse matrix is passed as-is: MultinomialNB accepts sparse input, so
# converting it with .toarray() only costs memory.
cv_scores = cross_val_score(mnb, X=bow,
                            y=df.cluster.values, cv=5)
mean_cv = cv_scores.mean()


In [None]:
# report the per-fold scores and their mean rounded to two decimals
print(f'CV scores: {cv_scores}')
print(f'Mean Cross validated accuracy: {round(mean_cv, 2)}')

In [None]:
######################Account Descriptions#######################

In [None]:
# summary statistics of column '4'
# NOTE(review): presumably the account-description field named by the section
# banner; it is not created anywhere in this notebook — confirm it is in the loaded data
df.loc[:, '4'].describe()

In [None]:
# frequency of each distinct value in column '4'
# NOTE(review): column '4' is not created in this notebook — confirm it is in the loaded data
df.loc[:, '4'].value_counts()

In [None]:
#####################Botometer#######################

In [None]:
from botometer import Botometer

In [None]:
# Botometer Pro endpoint on RapidAPI.
botometer_api_url = "https://botometer-pro.p.rapidapi.com"
# Read the key from the RAPIDAPI_KEY environment variable so the secret does not
# have to be stored in the notebook; the previous literal is kept as a fallback.
rapidapi_key = os.environ.get("RAPIDAPI_KEY", '...........')

In [None]:
# Botometer client that waits when the rate limit is hit.
# NOTE(review): `twitter_app_auth` is not defined in the visible cells — presumably
# a dict of Twitter app credentials; confirm it is set before running this cell.
bom = Botometer(
    wait_on_ratelimit=True,
    botometer_api_url=botometer_api_url,
    rapidapi_key=rapidapi_key,
    **twitter_app_auth
)

In [None]:
# Result file for the Botometer scores; the next cell hands it to csv.writer.
# newline='' is what the csv module expects for such files — without it every
# row is followed by a blank line on Windows.
output = open('D://MRNA_Bot_7_result.csv', 'w+', newline='')

In [None]:
# Query Botometer for every account and write its English and universal CAP
# scores as one CSV row. Accounts whose lookup fails with a Twitter error are
# reported and skipped.
# NOTE(review): skipped accounts leave no row and the username is not written,
# so rows cannot be matched back to df afterwards — consider adding it.
# NOTE(review): `tweepy` is not imported in the visible cells — confirm it is in scope.
csvfile = csv.writer(output)  # one writer for the whole run instead of one per row
try:
    for username in df['username'].values:
        try:
            api_data = bom.check_account(username)
        except tweepy.error.TweepError as e:
            print(e.reason)
            continue
        cap = api_data['cap']
        csvfile.writerow([cap['english'], cap['universal']])
finally:
    # Close (and flush) the file even when an unexpected error interrupts the loop.
    output.close()