# Analyzing possible Vulnerabilities/Cyber Attacks using Tweets

## Installing the required Libraries and Twint to get Tweets from Twitter API.

### Required Libraries

#### Twint, Optimuspyspark, aiohttp, aiodns, beautifulsoup4,
#### chardet, Elasticsearch, pysocks, Pandas, aiohttp_socks,
#### schedule, geopy, nest_asyncio, TextBlob, WordCloud,
#### Matplotlib, Seaborn, Pyspark
 

In [None]:
!pip install --user -r requirements.txt

### The Twint Repository: 

Download the twint repository from https://github.com/twintproject/twint and install it manually, as shown in the cell below (adjust the path to wherever you extracted the repository).

Alternatively, install it directly with:

!pip install --upgrade --user -e git+https://github.com/twintproject/twint.git@origin/master#egg=twint

In [None]:
!pip install C:\Users\Fardeenxbaig\Desktop\twitter_optimus_twint-master\twitter_optimus_twint-master\twint

In [None]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("twint/")

### Importing Twint

In [None]:
import twint

In [None]:
from optimus import Optimus
op = Optimus(verbose=True)

## Setting up Twint Configuration

In [None]:
# Set up TWINT config
c = twint.Config()

In [None]:
# Solve compatibility issues with notebooks and RunTime errors.
import nest_asyncio
nest_asyncio.apply()

# Search for the appropriate Cyber Security related tweets

In [None]:
# Fill in the shared twint config: search term, limit and pandas output.
c.Search = "spyware"
c.Limit = 350
c.Pandas = True
# Custom output format
c.Format = "Tweet: {tweet}"

# Run the search described by the config above.
twint.run.Search(c)

## Saving the scraped results into Pandas

In [None]:
def available_columns():
    """Return the ``columns`` of ``twint.output.panda.Tweets_df``."""
    tweets_frame = twint.output.panda.Tweets_df
    return tweets_frame.columns

In [None]:
def twint_to_pandas(columns):
    """Return the requested ``columns`` of ``twint.output.panda.Tweets_df``.

    Args:
        columns: Column selector passed straight to the DataFrame's ``[]``.
    """
    tweets_frame = twint.output.panda.Tweets_df
    return tweets_frame[columns]

In [None]:
available_columns()

In [None]:
df_pd = twint_to_pandas(["date", "username", "tweet", "hashtags", "nlikes"])

### Data stored in Pandas Dataframe

In [None]:
df_pd

# Sentiment Analysis of the Tweets

## Saving the data to Optimus

In [None]:
from pyspark.sql import SparkSession
from optimus import Optimus

spark = SparkSession.builder.appName('optimus').getOrCreate()
op= Optimus(spark)

#### Import Pandas

In [None]:
import pandas as pd

spark.conf.set("spark.sql.execution.arrow.enabled", "true")
pdf = pd.DataFrame(df_pd)
df = spark.createDataFrame(pdf)

## Cleaning the tweets

In [None]:
clean_tweets = df.cols.remove_accents("tweet") \
                 .cols.remove_special_chars("tweet")

In [None]:
clean_tweets.count()

In [None]:
sc = spark.sparkContext

In [None]:
import re

In [None]:
tweets = clean_tweets.select("tweet").rdd.flatMap(lambda x: x).collect()


### Cleaned Tweets

In [None]:
tweets

### Analyze sentiment of tweets

In [None]:
from textblob import TextBlob
from IPython.display import Markdown, display

In [None]:
def printmd(string, color=None):
    """Display ``string`` as Markdown in the notebook, optionally coloured.

    Args:
        string: Text to display; it is rendered through ``Markdown``.
        color: CSS colour applied via an inline ``<span>``. When ``None``
            the text is displayed without any colour wrapper.
    """
    if color is None:
        # Without a colour there is nothing to style; wrapping anyway would
        # emit the invalid CSS declaration "color:None".
        markup = str(string)
    else:
        markup = "<span style='color:{}'>{}</span>".format(color, string)
    display(Markdown(markup))

### This will give results after Sentiment Analysis through TextBlob

In [None]:
# Print each tweet, its TextBlob sentiment and a coloured label for the
# sign of the first sentiment component.
for tweet in tweets:
    print(tweet)
    analysis = TextBlob(tweet)
    print(analysis.sentiment)
    polarity = analysis.sentiment[0]
    if polarity > 0:
        printmd('Positive', color="green")
        continue
    if polarity < 0:
        printmd('Negative', color="red")
        continue
    # Neither positive nor negative: neutral, followed by an empty line.
    printmd("Neutral", color="grey")
    print("")

### The loop above won’t scale: it collects every tweet from Spark onto the driver, so the driver’s RAM is the limit. Next, we transform it into Spark code.

## Adding sentiments directly to the dataframe

In [None]:
from pyspark.sql.functions import udf

In [None]:
def apply_blob(sentence):
    """Classify the sentiment of ``sentence`` with TextBlob.

    Args:
        sentence: Tweet text, or ``None`` for a missing value.

    Returns:
        ``0.0`` when the first sentiment component is zero (neutral),
        ``1.0`` when it is positive, ``2.0`` when it is negative, and
        ``None`` when ``sentence`` is ``None``.
    """
    if sentence is None:
        # A null row must not reach TextBlob (it only accepts strings); one
        # failing row would otherwise abort the whole Spark job.
        return None
    polarity = TextBlob(sentence).sentiment[0]
    if polarity > 0.0:
        return 1.0  # Positive
    if polarity < 0.0:
        return 2.0  # Negative
    return 0.0  # Neutral

In [None]:
# Declare the return type explicitly: without it udf() falls back to
# StringType and the "sentiment" column would hold strings, not doubles.
from pyspark.sql.types import DoubleType

sentiment = udf(apply_blob, DoubleType())

In [None]:
# Keep the DataFrame itself: show() only prints and returns None, so it must
# not be chained into the assignment.
results = clean_tweets.withColumn("sentiment", sentiment(clean_tweets['tweet']))
results.show()

# Making the code Modular

### Setting up the Configuration 

In [None]:
%load_ext autoreload
%autoreload 2

# Import twint
import sys
sys.path.append("twint/")

# Set up TWINT config
import twint
c = twint.Config()

# Other imports
import seaborn as sns
import os
from optimus import Optimus
op = Optimus()

# Solve compatibility issues with notebooks and RunTime errors.
import nest_asyncio
nest_asyncio.apply()

# Disable annoying printing

class HiddenPrints:
    """Context manager that redirects ``sys.stdout`` to ``os.devnull``.

    On exit the stream that was active on entry is restored. Exceptions
    raised inside the ``with`` body are not suppressed.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        # Keep our own handle so __exit__ closes exactly the file opened here.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore first so stdout is usable even if closing fails, and never
        # close a stream that the body may have assigned to sys.stdout.
        sys.stdout = self._original_stdout
        self._devnull.close()

### All the code snippets from above are modularised as Functions here

In [None]:
from textblob import TextBlob
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

# Function to get sentiment
def apply_blob(sentence):
    """Classify the sentiment of ``sentence`` with TextBlob.

    Args:
        sentence: Tweet text, or ``None`` for a missing value.

    Returns:
        ``0.0`` when the first sentiment component is zero (neutral),
        ``1.0`` when it is positive, ``2.0`` when it is negative, and
        ``None`` when ``sentence`` is ``None``.
    """
    if sentence is None:
        # A null row must not reach TextBlob (it only accepts strings); one
        # failing row would otherwise abort the whole Spark job.
        return None
    polarity = TextBlob(sentence).sentiment[0]
    if polarity > 0.0:
        return 1.0  # Positive
    if polarity < 0.0:
        return 2.0  # Negative
    return 0.0  # Neutral

# Spark UDF around apply_blob; its result is declared as a double column.
sentiment = udf(apply_blob, returnType=DoubleType())

# Transform result to pandas
def twint_to_pandas(columns):
    """Return the requested ``columns`` of ``twint.output.panda.Tweets_df``.

    Args:
        columns: Column selector passed straight to the DataFrame's ``[]``.
    """
    tweets_frame = twint.output.panda.Tweets_df
    return tweets_frame[columns]

def tweets_sentiment(search, limit=1):
    """Search tweets for ``search`` and return them with a sentiment column.

    Args:
        search: Query string assigned to ``c.Search``.
        limit: Value assigned to ``c.Limit`` (defaults to 1).

    Returns:
        A Spark DataFrame holding the ``date``, ``username``, ``tweet``,
        ``hashtags`` and ``nlikes`` columns, with accents and special
        characters removed from ``tweet``, plus a ``sentiment`` column
        computed by the ``sentiment`` UDF.

    Note:
        The module-level twint config ``c`` is modified in place.
    """
    # Imported locally so the function does not depend on an earlier notebook
    # cell having created a global ``spark`` variable.
    from pyspark.sql import SparkSession

    c.Search = search
    # Custom output format
    c.Format = "Username: {username} |  Tweet: {tweet}"
    c.Limit = limit
    c.Pandas = True
    # NOTE(review): the search call could be wrapped in HiddenPrints() to
    # silence its output; it appears to return None, so this prints "None" -
    # confirm before removing the print.
    print(twint.run.Search(c))

    # Transform tweets to pandas DF
    df_pd = twint_to_pandas(["date", "username", "tweet", "hashtags", "nlikes"])

    # Transform Pandas DF to Optimus/Spark DF. getOrCreate() hands back the
    # session that is already running when there is one.
    spark = SparkSession.builder.getOrCreate()
    spark.conf.set("spark.sql.execution.arrow.enabled", "true")
    # df_pd is already a pandas DataFrame, so no extra pd.DataFrame copy.
    df = spark.createDataFrame(df_pd)

    # Clean tweets
    clean_tweets = df.cols.remove_accents("tweet") \
                 .cols.remove_special_chars("tweet")

    # Add sentiment to final DF
    return clean_tweets.withColumn("sentiment", sentiment(clean_tweets['tweet']))

### Using this to get the Tweets and add Sentiment

In [None]:
df_result = tweets_sentiment("spyware", limit=350)

### Results with added Sentiment in Dataframe

In [None]:
df_result.show()

## Distribution of Sentiments

In [None]:
df_pd_result = df_result.toPandas()

## Saving the cleaned Tweets with Sentiment into CSV for any post analysis if needed

In [None]:
df_pd_result.to_csv('processedtweets.csv')

In [None]:
# Configure the figure size before plotting: the setting only applies to
# figures created afterwards, so setting it after distplot() had no effect
# on this plot.
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.distplot(df_pd_result['sentiment'])

## Getting Tweets easily to build a WordCloud

In [None]:
%load_ext autoreload
%autoreload 2

import sys, os
sys.path.append("twint/")

import twint
import emoji
import numpy as np
from PIL import Image
from os import path

# Solve compatibility issues with notebooks and RunTime errors.
import nest_asyncio
nest_asyncio.apply()

In [None]:
# Disable annoying printing

class HiddenPrints:
    """Context manager that redirects ``sys.stdout`` to ``os.devnull``.

    On exit the stream that was active on entry is restored. Exceptions
    raised inside the ``with`` body are not suppressed.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        # Keep our own handle so __exit__ closes exactly the file opened here.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore first so stdout is usable even if closing fails, and never
        # close a stream that the body may have assigned to sys.stdout.
        sys.stdout = self._original_stdout
        self._devnull.close()

## Get tweets easily with a simple search

In [None]:
def get_tweets(search, limit=100):
    """Run a twint search and return the ``username`` and ``tweet`` columns.

    Args:
        search: Query string assigned to the config's ``Search``.
        limit: Value assigned to the config's ``Limit`` (defaults to 100).

    Returns:
        The ``["username", "tweet"]`` selection of
        ``twint.output.panda.Tweets_df``.
    """
    config = twint.Config()
    config.Pandas = True
    config.Pandas_clean = True
    config.Search = search
    config.Limit = limit

    #with HiddenPrints():
    print(twint.run.Search(config))

    wanted = ["username", "tweet"]
    return twint.output.panda.Tweets_df[wanted]

In [None]:
tweets = get_tweets("spyware", limit=500)

In [None]:
tweets

In [None]:
tweets.count()

## Generating the word cloud to see trending possible vulnerabilities

### Refining further to remove unknown characters from Tweets

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
%matplotlib inline

text = tweets.tweet.values

# Extra stopwords for link fragments and Twitter boilerplate that would
# otherwise show up in the cloud.
stopwords = set(STOPWORDS)
stopwords.update([
    "https", "http", "xa0", "xa0'", "bitly", "bit", "ly", "twitter", "pic",
])

# Join the tweets into one corpus. str(text) would hand WordCloud the NumPy
# array repr instead: quoted and escaped entries (e.g. "\xa0"), and elided
# with "..." once the array is longer than NumPy's print threshold.
corpus = " ".join(str(tweet) for tweet in text)

wordcloud = WordCloud(
    background_color = 'black',
    width = 1000,
    height = 500,
    stopwords = stopwords).generate(corpus)

### The WordCloud Generated

In [None]:
# Set the default figure size before drawing: rcParams are read when the
# figure is created, so assigning them after imshow() did not resize it.
plt.rcParams['figure.figsize'] = [10,25]
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")

# [Mirza Fardeen Baig](https://github.com/fardeenxbaig)