<a href="https://colab.research.google.com/github/conceptbin/dipstick/blob/master/Dipstick_0_2_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# User input
1. Enter search term and the limit (max. no. of tweets) in this section.
2. Mount Google Drive for this notebook.
3. Run all cells (Runtime > Run all).

In [0]:
# Query text for the Twitter search; it is assigned to the Twint config's Search option in the search cell.
search_term = "Roehampton" #Enter search term here.
# Upper bound on the number of tweets to collect; it is assigned to the Twint config's Limit option.
limit = 5000 #If you're just testing the search, set a smaller limit.

Below this cell, no editing is needed if you're just running the notebook as normal.

# Search and save to pandas dataframe

In [0]:
#Install Twint for Twitter search
!pip3 install twint

In [0]:
import pandas as pd
import twint

# Build the Twint configuration for this run, one option per entry.
c = twint.Config()
for option, value in (
    ("Search", search_term),  # query text from the user-input cell
    ("Limit", limit),         # maximum number of tweets from the user-input cell
    #("Min_likes", 5),        # Minimum number of likes, to just get tweets people interacted with.
    ("Pandas", True),         # presumably makes Twint keep the results as a pandas DataFrame -- confirm in Twint docs
):
    setattr(c, option, value)

# Run the search, then pick up the DataFrame from Twint's pandas storage module.
twint.run.Search(c)
df = twint.storage.panda.Tweets_df

# Basic Analysis

## User by frequency

In [0]:
# Count tweets per username and rank the accounts from most to fewest tweets.
most_tweets = (
    df.groupby(['username'])
      .size()
      .reset_index(name='counts')
      .sort_values(by='counts', ascending=False)
)

## Likes

In [0]:
# Most liked tweets: take the 1000 rows with the highest 'nlikes', drop rows whose
# tweet text repeats, re-sort by likes (descending) and keep the first 100.
most_l = (
    df.nlargest(1000, 'nlikes')
      .drop_duplicates(['tweet'])
      .sort_values(by='nlikes', ascending=False)
      .iloc[:100]
)
# Uncomment the next line to show the ten most liked tweets.
#most_l[:10][['date','username','tweet','nlikes']]

## Retweets

In [0]:
# Most retweeted tweets: take the 1000 rows with the highest 'nretweets', drop rows
# whose tweet text repeats, re-sort by retweets (descending) and keep the first 100.
most_r = (
    df.nlargest(1000, 'nretweets')
      .drop_duplicates(['tweet'])
      .sort_values(by='nretweets', ascending=False)
      .iloc[:100]
)
# Uncomment the next line to show the ten most retweeted tweets.
#most_r[:10][['date','username','tweet','nretweets']]

## Overview

In [0]:
# Headline figures for the overview report.
tweets_total = df.shape[0]               # number of rows, i.e. tweets in the set
tweeters = df['username'].unique().size  # number of distinct usernames
# Median likes and median retweets per tweet.
median_likes, median_retweets = (
    df[column].median() for column in ('nlikes', 'nretweets')
)

In [0]:
# Gather the overview figures into a dict with two parallel lists:
# 'What' holds the labels and 'Number' holds the matching values.
report = dict(zip(
    ('What', 'Number'),
    map(list, zip(
        ('Total no. of tweets in the sample', tweets_total),
        ('No. of unique tweeters', tweeters),
        ('Median likes', median_likes),
        ('Median retweets', median_retweets),
    )),
))

In [0]:
# Turn the overview dict into a DataFrame (one column per dict key).
report_table = pd.DataFrame(data=report)
# Uncomment the next line to display the table.
#report_table

# Keywords, hashtags, etc.

## N-grams
Code adapted from De Dios, [From Dataframe to N-Grams](https://towardsdatascience.com/from-dataframe-to-n-grams-e34e29df3460) (Medium, 22 May 2020).

In [0]:
#Import libraries 

# natural language processing: n-gram ranking
import re
import unicodedata
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
# add appropriate words that will be ignored in the analysis
#ADDITIONAL_STOPWORDS = ['covfefe']

import matplotlib.pyplot as plt

In [0]:
#Function for basic cleaning of the text.
def basic_clean(text):
  """
  Normalise raw text and return its lemmatized, stop-word-free tokens.

  Steps: NFKD-normalise, drop every non-ASCII character, lower-case, delete
  every character that is neither a word character nor whitespace, split on
  whitespace, discard English stop words and lemmatize the remaining tokens
  with WordNet.

  Parameters
  ----------
  text : str
      The text to clean.

  Returns
  -------
  list of str
      Lemmatized tokens in their original order.
  """
  wnl = nltk.stem.WordNetLemmatizer()
  punctuation = re.compile(r'[^\w\s]')
  # A set gives constant-time membership tests (the filter runs once per token).
  # To ignore further words, add them to this set (e.g. ADDITIONAL_STOPWORDS).
  stop_words = set(nltk.corpus.stopwords.words('english'))
  # Punctuation is stripped from the text before filtering, so a stop word that
  # contains an apostrophe (e.g. "don't") reaches the filter as "dont". Add the
  # punctuation-free spellings so those stop words are still removed.
  stop_words |= {punctuation.sub('', word) for word in stop_words}
  text = (unicodedata.normalize('NFKD', text)
    .encode('ascii', 'ignore')
    .decode('utf-8', 'ignore')
    .lower())
  words = punctuation.sub('', text).split()
  return [wnl.lemmatize(word) for word in words if word not in stop_words]

In [0]:
# Clean the whole corpus: join the tweet texts with spaces and tokenise them.
# The texts are joined directly instead of passing str(list): the list repr
# escapes characters such as newlines as "\n", and once the cleaner strips the
# backslash a stray "n" ends up glued onto the neighbouring words.
words = basic_clean(' '.join(map(str, df['tweet'])))

In [0]:
#Top 10 bigrams
%%capture
(pd.Series(nltk.ngrams(words, 2)).value_counts())[:10]

In [0]:
#top 10 trigrams
%%capture
(pd.Series(nltk.ngrams(words, 3)).value_counts())[:10]

## Visualization of N-grams

In [0]:
# Frequency tables of the 20 most common bigrams (size 2) and trigrams (size 3),
# built from the cleaned token list.
bigrams_series, trigrams_series = (
    pd.Series(nltk.ngrams(words, size)).value_counts().iloc[:20]
    for size in (2, 3)
)

### Bigrams and Trigrams

In [0]:
# Bigrams chart: horizontal bars of the bigram counts, sorted ascending before
# plotting; title and axis labels are set on the Axes that pandas returns.
ax = bigrams_series.sort_values().plot.barh(color='blue', width=0.9, figsize=(12, 8))
ax.set_title('20 Most Frequently Occuring Bigrams')
ax.set_ylabel('Bigram')
ax.set_xlabel('# of Occurences')

In [0]:
# Trigrams chart: horizontal bars of the trigram counts, sorted ascending before
# plotting; title and axis labels are set on the Axes that pandas returns.
ax = trigrams_series.sort_values().plot.barh(color='blue', width=0.9, figsize=(12, 8))
ax.set_title('20 Most Frequently Occuring Trigrams')
ax.set_ylabel('Trigram')
ax.set_xlabel('# of Occurences')

## Hashtags

In [0]:
#Function for cleaning the hashtags column and collecting into a list.
def tag_clean(text):
  """
  Split text into lower-case ASCII tokens and lemmatize each of them.

  The text is NFKD-normalised, non-ASCII characters are dropped, the result is
  lower-cased, every character that is neither a word character nor whitespace
  is removed, and the remainder is split on whitespace. No stop-word filtering
  is applied.

  Returns a list of lemmatized tokens in their original order.
  """
  lemmatizer = nltk.stem.WordNetLemmatizer()
  normalized = unicodedata.normalize('NFKD', text)
  ascii_only = normalized.encode('ascii', 'ignore').decode('utf-8', 'ignore')
  stripped = re.sub(r'[^\w\s]', '', ascii_only.lower())
  return list(map(lemmatizer.lemmatize, stripped.split()))

In [0]:
# Collect all hashtags into one flat list of cleaned tokens.
# The column is converted to the string form of a Python list and tag_clean then
# strips the brackets, quotes and commas, leaving only the tag text.
tags = tag_clean(str(df['hashtags'].tolist()))

In [0]:
# Count how often each hashtag token occurs and keep the 30 most frequent.
hashtags_series = pd.Series(tags).value_counts().iloc[:30]

### Visualization of top hashtags

In [0]:
# Chart of top hashtags: horizontal bars of the hashtag counts, sorted ascending
# before plotting; title and axis labels are set on the Axes that pandas returns.
ax = hashtags_series.sort_values().plot.barh(color='blue', width=0.9, figsize=(12, 8))
ax.set_title('30 Most Frequent Hashtags')
ax.set_ylabel('Hashtag')
ax.set_xlabel('# of Occurences')

# Output
The Excel workbook "dipstick_out.xlsx" is saved to the notebook's working directory; on Google Colab you can find it in the Files sidebar.

In [0]:
# Write every result table to its own sheet of a single Excel workbook.
# Sheets are written in the order listed; the sheet names are kept exactly as
# they were (including the bigram sheet's unclosed parenthesis).
with pd.ExcelWriter('dipstick_out.xlsx') as writer:
  for sheet_name, table in (
      ('Overview report', report_table),
      ('Most likes', most_l),
      ('Most retweets', most_r),
      ('Most tweets by user', most_tweets),
      ('Top bigrams (stopwords removed', bigrams_series),
      ('Top trigrams', trigrams_series),
      ('Top hashtags', hashtags_series),
      ('All tweets', df),
  ):
    table.to_excel(writer, sheet_name=sheet_name)