In [1]:
from temp_clean_tweets_dataframe import Clean_Tweets

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import json
import pandas as pd
from textblob import TextBlob

In [4]:
def read_json(json_file: str)->list:
    """
    json file reader to open and read json files into a list
    Args:
    -----
    json_file: str - path of a json file
    
    Returns
    -------
    length of the json file and a list of json
    """
    
    tweets_data = []
    for tweets in open(json_file,'r'):
        tweets_data.append(json.loads(tweets))
    return len(tweets_data), tweets_data


class TweetDfExtractor:
    """
    this function will parse tweets json into a pandas dataframe
    
    Return
    ------
    dataframe
    """
    def __init__(self, tweets_list):
        self.tweets_list = tweets_list
        

    # an example function
    def find_statuses_count(self)->list:
        statuses_count = [i['user']['statuses_count'] for i in self.tweets_list]
        return statuses_count 
        
        
    def find_full_text(self)->list:
        text = list()
        for tweet in self.tweets_list:
            if 'retweeted_status' in tweet.keys() and 'extended_tweet' in tweet['retweeted_status'].keys():
                text.append(tweet['retweeted_status']['extended_tweet']['full_text'])
            else: text.append('Empty') 
        return text

       
    def find_sentiments(self, text)->list:
        polarity, subjectivity = [], []
        for tweet in text:
            blob = TextBlob(tweet)
            sentiment = blob.sentiment
            polarity.append(sentiment.polarity)
            subjectivity.append(sentiment.subjectivity)
        return polarity, subjectivity
    

    def find_created_time(self)->list:
        created_at = [x['created_at'] for x in self.tweets_list]
        return created_at
    

    def find_source(self)->list:
        source = [i['source'] for i in self.tweets_list]
        return source
    

    def find_screen_name(self)->list:
        screen_name = [i['user']['screen_name'] for i in self.tweets_list]
        return screen_name
    

    def find_followers_count(self)->list:
        followers_count = [i['user']['followers_count'] for i in self.tweets_list]
        return followers_count 
    

    def find_friends_count(self)->list:
        friends_count = [i['user']['friends_count'] for i in self.tweets_list]
        return friends_count
    

    def is_sensitive(self)->list:
        try:
            is_sensitive = [x['possibly_sensitive'] for x in self.tweets_list]
        except KeyError:
            is_sensitive = None
        return is_sensitive
    

    def find_favourite_count(self)->list:
        favorite_count = []
        for tweet in self.tweets_list:
            if 'retweeted_status' in tweet.keys():
                favorite_count.append(tweet['retweeted_status']['favorite_count'])
            else: favorite_count.append(0)
        return favorite_count
    
    
    def find_retweet_count(self)->list:
        retweet_count = []
        for tweet in self.tweets_list:
            if 'retweeted_status' in tweet.keys():
                retweet_count.append(tweet['retweeted_status']['retweet_count'])
            else: retweet_count.append(0)
        return retweet_count
    

    def find_hashtags(self)->list:
        hashtags = list()
        for i in self.tweets_list:
            hashtags.append(", ".join([hashtag_item['text'] for hashtag_item in i['entities']['hashtags']]))
        return hashtags
    

    def find_mentions(self)->list:
        mentions = list()
        for i in self.tweets_list:
            mentions.append( ", ".join([mention['screen_name'] for mention in i['entities']['user_mentions']]))
        return mentions


    def find_location(self)->list:
        try:
            location = self.tweets_list['user']['location']
        except TypeError:
            location = ''
        return location
    
    
    def find_lang(self)->list:
        lang = [i['lang'] for i in self.tweets_list]
        return lang
    
        
    def get_tweet_df(self, save=False)->pd.DataFrame:
        """required column to be generated you should be creative and add more features"""
        
        columns = ['created_at', 'source', 'original_text','polarity','subjectivity', 'lang', 'favorite_count', 'retweet_count', 
            'original_author', 'followers_count','friends_count','possibly_sensitive', 'hashtags', 'user_mentions', 'place']
        
        created_at = self.find_created_time()
        source = self.find_source()
        text = self.find_full_text()
        polarity, subjectivity = self.find_sentiments(text)
        lang = self.find_lang()
        fav_count = self.find_favourite_count()
        retweet_count = self.find_retweet_count()
        screen_name = self.find_screen_name()
        follower_count = self.find_followers_count()
        friends_count = self.find_friends_count()
        sensitivity = self.is_sensitive()
        hashtags = self.find_hashtags()
        mentions = self.find_mentions()
        location = self.find_location()
        data = {'created_at':created_at, 'source':source, 'text':text, 'polarity':polarity, 'subjectivity':subjectivity, 'lang':lang, 'fav_count':fav_count, 'retweet_count':retweet_count, 'screen_name':screen_name, 'follower_count':follower_count, 'friends_count':friends_count, 'sensitivity':sensitivity, 'hashtags':hashtags, 'mentions':mentions, 'location':location}
        df = pd.DataFrame(data=data, columns=columns)

        if save:
            df.to_csv('processed_tweet_data.csv', index=False)
            print('File Successfully Saved.!!!')
        
        return df 

In [5]:
if __name__ == "__main__":
    # required column to be generated you should be creative and add more features
    
    columns = ['created_at', 'source', 'original_text','clean_text', 'sentiment','polarity','subjectivity', 'lang', 'favorite_count', 'retweet_count', 
    'original_author', 'screen_count', 'followers_count','friends_count','possibly_sensitive', 'hashtags', 'user_mentions', 'place', 'place_coord_boundaries']
    
    _, tweet_list = read_json("data/Economic_Twitter_Data.json")
    tweet = TweetDfExtractor(tweet_list)
    tweet_df = tweet.get_tweet_df() 

In [6]:
tweet_df.shape

(24625, 15)

In [7]:
ct = Clean_Tweets(tweet_df)

Automation in Action...!!!


In [8]:
tweet_df.head()

Unnamed: 0,created_at,source,original_text,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
0,Fri Apr 22 22:20:18 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",,-0.1,0.5,de,,355,,,12,,,,
1,Fri Apr 22 22:19:16 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",,-0.1,0.5,de,,505,,,12,,,,
2,Fri Apr 22 22:17:28 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",,-0.1,0.5,de,,4,,,12,,,,
3,Fri Apr 22 22:17:20 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",,-0.1,0.5,de,,332,,,12,,"Deutschen, Spritpreisen, inflation, Abgaben",,
4,Fri Apr 22 22:13:15 +0000 2022,"<a href=""http://twitter.com/download/android"" ...",,-0.1,0.5,de,,386,,,12,,,,


# Exploratory Data Analysis

In [9]:
tweet_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24625 entries, 0 to 24624
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   created_at          24625 non-null  object 
 1   source              24625 non-null  object 
 2   original_text       0 non-null      object 
 3   polarity            24625 non-null  float64
 4   subjectivity        24625 non-null  float64
 5   lang                24625 non-null  object 
 6   favorite_count      0 non-null      object 
 7   retweet_count       24625 non-null  int64  
 8   original_author     0 non-null      object 
 9   followers_count     0 non-null      object 
 10  friends_count       24625 non-null  int64  
 11  possibly_sensitive  0 non-null      object 
 12  hashtags            24625 non-null  object 
 13  user_mentions       0 non-null      object 
 14  place               0 non-null      object 
dtypes: float64(2), int64(2), object(11)
memory usage: 2.8

In [10]:
df_new = ct.convert_to_datetime(tweet_df)

In [11]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24514 entries, 0 to 24624
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   created_at          24514 non-null  datetime64[ns, UTC]
 1   source              24514 non-null  object             
 2   original_text       0 non-null      object             
 3   polarity            24514 non-null  float64            
 4   subjectivity        24514 non-null  float64            
 5   lang                24514 non-null  object             
 6   favorite_count      0 non-null      object             
 7   retweet_count       24514 non-null  int64              
 8   original_author     0 non-null      object             
 9   followers_count     0 non-null      object             
 10  friends_count       24514 non-null  int64              
 11  possibly_sensitive  0 non-null      object             
 12  hashtags            24514 non-nu

In [12]:
df_new = ct.remove_non_english_tweets(df_new)

In [19]:
df_new.head()

Unnamed: 0,created_at,source,original_text,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
38,2022-04-22 22:17:05+00:00,"<a href=""http://twitter.com/download/android"" ...",,-0.1,0.5,en,,43,,,55,,"gold, silver, crypto",,
39,2022-04-22 13:44:53+00:00,"<a href=""http://twitter.com/download/android"" ...",,-0.1,0.5,en,,32,,,55,,,,
41,2022-04-22 06:10:34+00:00,"<a href=""http://twitter.com/download/android"" ...",,-0.1,0.5,en,,26,,,55,,,,
42,2022-04-21 17:22:09+00:00,"<a href=""http://twitter.com/download/android"" ...",,-0.1,0.5,en,,213,,,55,,,,
43,2022-04-21 10:32:26+00:00,"<a href=""http://twitter.com/download/android"" ...",,-0.1,0.5,en,,417,,,55,,,,


In [20]:
df_new.describe()

Unnamed: 0,polarity,subjectivity,retweet_count,friends_count
count,16374.0,16374.0,16374.0,16374.0
mean,-0.1,0.5,626.303286,1337.912056
std,2.405094e-14,0.0,7104.395997,2975.315626
min,-0.1,0.5,0.0,0.0
25%,-0.1,0.5,0.0,113.0
50%,-0.1,0.5,0.0,437.0
75%,-0.1,0.5,32.0,1433.0
max,-0.1,0.5,434379.0,41866.0
