<a href="https://colab.research.google.com/github/conceptbin/dipstick/blob/master/Dipstick_0_3_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# User input
Enter user variables in this section, then run all cells:

In [None]:
# Query string assigned to the Twint search configuration (c.Search) further down.
search_term = "Roehampton"
# Value assigned to c.Limit further down. If you're just testing the search, set a
# smaller limit: the higher the limit, the longer the search takes to run.
limit = 1000

# Search and save to pandas dataframe

In [None]:
#Install Twint for Twitter search
!pip3 install twint

In [None]:
import pandas as pd
import twint

# Build the search configuration from the user variables set in the first code cell.
c = twint.Config()
c.Search = search_term
c.Limit = limit
# Optional filter: uncomment to only get tweets with at least this many likes,
# i.e. tweets people interacted with.
#c.Min_likes = 5
# Presumably tells Twint to keep the results for pandas; the dataframe is read
# back from twint.storage.panda below -- TODO confirm against the Twint docs.
c.Pandas = True

# Run the search, then pick up the collected tweets as a dataframe.
twint.run.Search(c)
df = twint.storage.panda.Tweets_df

# Basic Analysis

## User by frequency

In [None]:
# Count tweets per username and rank the accounts from most to least active.
most_tweets = (
    df.groupby(['username'])
      .size()
      .reset_index(name='counts')
      .sort_values(by='counts', ascending=False)
)
# Show the 20 most active accounts.
most_tweets.head(20)

## Likes

In [None]:
#Most likes
# Take the 1000 rows with the highest like count, drop rows that repeat the
# same tweet text, and order the result by likes (highest first).
most_l = (
    df.nlargest(1000, 'nlikes')
      .drop_duplicates(['tweet'])
      .sort_values(by='nlikes', ascending=False)
)
# Display the top 10 with a readable subset of columns.
most_l[['date','username','tweet','nlikes']].head(10)

## Retweets

In [None]:
#Most retweeted
# Take the 1000 rows with the highest retweet count and drop rows that repeat
# the same tweet text.
most_r = df.nlargest(1000, 'nretweets').drop_duplicates(['tweet'])
# Display the top 10 with a readable subset of columns.
most_r[['date','username','tweet','nretweets']].head(10)

## Overview

In [None]:
#Overview data
tweets_total = df.shape[0]                      # Total no. of tweets in the set
tweeters = len(pd.unique(df['username']))       # No. of unique tweeters
# Smallest and largest value of the 'date' column in the result set.
time_from, time_to = df['date'].min(), df['date'].max()

In [None]:
#Gather overview data into a dict
# One list of row labels and one list of the matching values, in the same order.
report = dict(
    What=['Total no. of tweets:', 'No. of unique tweeters:', 'Time from:', 'Time to:'],
    Number=[tweets_total, tweeters, time_from, time_to],
)

In [None]:
#Make overview dataframe from dict and display
# Each dict key becomes a column; the bare name on the last line renders the table.
report_table = pd.DataFrame.from_dict(report)
report_table

# Keywords, hashtags, etc.

## N-grams
Code adapted from De Dios, [From Dataframe to N-Grams](https://towardsdatascience.com/from-dataframe-to-n-grams-e34e29df3460) (Medium, 22 May 2020).

In [None]:
#Import libraries

# natural language processing: n-gram ranking
import re
import unicodedata
import nltk
# Download the NLTK data used by the cleaning functions below: the stopword
# lists (nltk.corpus.stopwords) and WordNet (for nltk.stem.WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
# Plotting, used for the n-gram and hashtag bar charts.
import matplotlib.pyplot as plt

In [None]:
#Function for stripping stopwords, lemmatizing, removing punctuation.
def basic_clean(text):
  """
  Normalise raw text and return its lemmatized, stopword-free words.

  The text is NFKD-normalised, reduced to ASCII (characters with no ASCII
  form are dropped) and lowercased. It is then split on whitespace; English
  stopwords are discarded, punctuation is removed from the remaining tokens
  and each surviving word is lemmatized.

  Args:
    text: The string to clean.

  Returns:
    A list of cleaned words, in the order they appear in the text.
  """
  wnl = nltk.stem.WordNetLemmatizer()
  # A set gives constant-time membership tests (the list is scanned per word otherwise).
  stopwords = set(nltk.corpus.stopwords.words('english'))
  text = (unicodedata.normalize('NFKD', text)
    .encode('ascii', 'ignore')
    .decode('utf-8', 'ignore')
    .lower())
  cleaned = []
  for token in text.split():
    # The stopword list spells contractions with their apostrophe ("don't"),
    # so compare the token *before* punctuation is removed from inside it;
    # only surrounding punctuation (quotes, commas, ...) is trimmed here.
    if re.sub(r'^\W+|\W+$', '', token) in stopwords:
      continue
    word = re.sub(r'[^\w\s]', '', token)
    # Tokens made only of punctuation end up empty and are skipped.
    if word and word not in stopwords:
      cleaned.append(wnl.lemmatize(word))
  return cleaned

In [None]:
#try out basic_clean
# Join the tweet texts with spaces instead of cleaning str(list): the list repr
# writes a newline inside a tweet as a literal "\n", and once the punctuation
# filter drops the backslash a stray "n" is glued between the neighbouring words.
# astype(str) keeps non-string entries (e.g. missing values) from breaking the join.
# Note that n-grams built from `words` can still span two consecutive tweets.
words = basic_clean(' '.join(df['tweet'].astype(str)))

In [None]:
#Top 10 bigrams
# value_counts() orders by frequency (highest first), so the head is the top 10.
pd.Series(nltk.ngrams(words, 2)).value_counts().head(10)

In [None]:
#Top 10 trigrams
# value_counts() orders by frequency (highest first), so the head is the top 10.
pd.Series(nltk.ngrams(words, 3)).value_counts().head(10)

## Visualization of N-grams

In [None]:
# Keep the 20 most frequent bigrams and trigrams for plotting and for the Excel export.
bigrams_series = pd.Series(nltk.ngrams(words, 2)).value_counts().head(20)
trigrams_series = pd.Series(nltk.ngrams(words, 3)).value_counts().head(20)

In [None]:
# Quick look at the first five entries before plotting.
bigrams_series.head(5)

### Bigrams

In [None]:
# Horizontal bar chart of the top bigrams. barh draws the first row at the bottom,
# so sorting ascending puts the most frequent bigram at the top of the chart.
bigrams_series.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))
plt.title('20 Most Frequently Occurring Bigrams')
plt.ylabel('Bigram')
plt.xlabel('# of Occurrences');  # trailing ';' hides the Text(...) repr in the cell output

### Trigrams

In [None]:
# Horizontal bar chart of the top trigrams. barh draws the first row at the bottom,
# so sorting ascending puts the most frequent trigram at the top of the chart.
trigrams_series.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))
plt.title('20 Most Frequently Occurring Trigrams')
plt.ylabel('Trigram')
plt.xlabel('# of Occurrences');  # trailing ';' hides the Text(...) repr in the cell output

## Hashtags

In [None]:
def tag_clean(text):
  """
  Normalise hashtag text into a list of lemmatized tokens.

  The text is NFKD-normalised, reduced to ASCII (characters with no ASCII
  form are dropped) and lowercased; punctuation is removed, the remainder is
  split on whitespace and every token is lemmatized. No stopwords are removed.

  Args:
    text: The string to clean.

  Returns:
    A list of cleaned tokens, in the order they appear in the text.
  """
  ascii_bytes = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
  lowered = ascii_bytes.decode('utf-8', 'ignore').lower()
  tokens = re.sub(r'[^\w\s]', '', lowered).split()
  lemmatize = nltk.stem.WordNetLemmatizer().lemmatize
  return list(map(lemmatize, tokens))

In [None]:
#try out tag_clean
# The whole 'hashtags' column is rendered as one string (the repr of its list);
# tag_clean then strips the brackets, quotes and commas and splits it into tokens.
tags = tag_clean(str(df['hashtags'].tolist()))

In [None]:
#Top hashtags
# value_counts() orders by frequency (highest first); keep the 30 most frequent.
hashtags_series = pd.Series(tags).value_counts().head(30)

## Visualization of top hashtags

In [None]:
# Horizontal bar chart of the top hashtags. barh draws the first row at the bottom,
# so sorting ascending puts the most frequent hashtag at the top of the chart.
hashtags_series.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))
plt.title('30 Most Frequent Hashtags')
plt.ylabel('Hashtag')
plt.xlabel('# of Occurrences');  # trailing ';' hides the Text(...) repr in the cell output

# Output
Saves everything to an Excel workbook (`dipstick_out.xlsx`), with one sheet per table.

In [None]:
#Save dataframes to separate sheets in an Excel workbook.
# The context manager saves and closes the workbook when the block exits.
# Excel limits sheet names to 31 characters; the bigrams name below is exactly 31.
with pd.ExcelWriter('dipstick_out.xlsx') as writer:
  report_table.to_excel(writer, sheet_name='Overview report')
  most_l.to_excel(writer, sheet_name='Most likes')
  most_r.to_excel(writer, sheet_name='Most retweets')
  most_tweets.to_excel(writer, sheet_name='Most tweets by user')
  # Closing parenthesis restored in the sheet name.
  bigrams_series.to_excel(writer, sheet_name='Top bigrams (stopwords removed)')
  trigrams_series.to_excel(writer, sheet_name='Top trigrams')
  hashtags_series.to_excel(writer, sheet_name='Top hashtags')
  df.to_excel(writer, sheet_name='All tweets')