In [1]:
from twython import Twython
from dotenv import load_dotenv
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import datetime

import time

In [2]:
load_dotenv()

CONSUMER_KEY = os.getenv('API_KEY')
CONSUMER_SECRET = os.getenv('API_KEY_SECRET')

twitter = Twython(CONSUMER_KEY, CONSUMER_SECRET)

In [3]:
countries_list = ['Afghanistan', "Armenia", "Australia", 
                  "Azerbaijan", "Bangladesh", "Bhutan", 
                  "Brunei", "Cambodia", "China", 
                 "North Korea", "Fiji", "Georgia",
                 "India", "Indonesia", "Iran", 
                 "Japan", "Kazakhstan", "Kiribati", 
                 "Kyrgyzstan", "Laos", "Malaysia", 
                 "Maldives", "Marshall Islands", 
                 "Micronesia", "Mongolia", "Myanmar", 
                 "Nauru", "Nepal", "New Zealand", 
                 "Pakistan", "Palau", "Papua New Guinea",
                 "Philippines", "South Korea", "Russia", 
                 "Samoa", "Singapore", "Solomon Islands", 
                 "Sri Lanka", "Tajikistan", "Thailand", 
                 "Timor-Leste", "Tonga", "Turkey", 
                 "Turkmenistan", "Tuvalu", "Uzbekistan", 
                 "Vanuatu", "Vietnam"]

keywords = ['environment', 'climate', 'CO2']


In [4]:
topics = {
    'Agriculture': ["Agriculture", "Meat", "Rice", "Fertilizer"], 
    'Other Fuel Combustion': ['Fuel', 'Forest fire', 'Wildfire', 'Biomass combustion'], 
    
    'Electricity/Heat': ['Electricity', 'Coal', 'Oil', 'Renewable', 'Gas'],
    'Energy': ['Energy'], 
    
    
    'Fugitive Emissions': ['Mining', 'Fugitive emissions'], 
    'Industrial Processes': ['Industrial', 'Cement', 'Chemical industry'], 
    'Land-Use Change and Forestry': ['Forestry', 'Forest', 'Deforestation', 'Wood'], 
    'Manufacturing/Construction': ['Manufacturing', 'Construction', 'Chemicals', 'Iron', 'Cloth'], 

    'Transportation': ['Transportation', 'Car', 'Plane', 'Train'], 
    'Waste': ['Waste', 'Landfill', 'Wastewater', 'Sewage']
}

In [5]:
for topic in topics:
    print(topic)
    #for subtopic in topics[topic]:
     #   print(subtopic)

Agriculture
Other Fuel Combustion
Electricity/Heat
Energy
Fugitive Emissions
Industrial Processes
Land-Use Change and Forestry
Manufacturing/Construction
Transportation
Waste


In [6]:
def create_query_country(country, keyword):
    query = {'q': f'{country} {keyword}',
        'result_type': 'popular',
        'count': 100,
        'lang': 'en',
        }
    
    return query

def create_query_geocode(geocode, keyword):
    query = {'q': f'{keyword}',
             'geocode':str(geocode),
        'result_type': 'recent',
        'count': 100,
        'lang': 'en',
        }
    
    return query

def create_query_geocode_topic(geocode, keyword, topic, until):
    query = {'q': f'{keyword} {topic}',
             'geocode':str(geocode),
        'result_type': 'recent',
        'count': 100,
        'lang': 'en',
        'until': str(until)
        }
    
    return query



In [7]:
def get_country(status):
    if status['place'] is not None: 
        
        if status['place']['country'] is not None:
        
            return status['place']['country']
    else:
        
        if status['user']['location'] is not None:
            
            location = status['user']['location']
            
            loc_country = np.array(countries_list)[[country in location for country in countries_list]]
            
            if len(loc_country) != 0:
                
                return loc_country[0]
            
            else:
                
                return 'Unknow'       
        else:
            
            return 'Unknown'
            
            
def get_url(status):
    
    if status['truncated']:
        
        return status['entities']['urls'][0]['url']
    
    else:
        return 'no url'

In [18]:
def get_tweets_from_geocode_and_topics(twitter, keywords, topics, geocode = "4.25329,137.19658,7387km"):
    
    # suggested radius = 7387.93 km 
    # suggested geocode = 4.25329,137.19658 (lat long)
    
    dict_ = {'country': [], 'keyword': [], 'topic': [], 'subtopic':[], 'user': [], 
             'date': [], 'text': [], 'favorite_count': [], 'followers_count': [], 'url': []}
    
    twitter = Twython(CONSUMER_KEY, CONSUMER_SECRET)

    for topic in topics:
        
        for subtopic in topics[topic]:
            
            for keyword in keywords:
                
                
                last_date = datetime.datetime.today().date()
                
                
                while abs( (last_date - datetime.datetime.today().date()).days ) < 20:
                    
                    query = create_query_geocode_topic(geocode, keyword, subtopic, str(last_date))
                    tweets = twitter.search(**query)
                    if int(twitter.get_lastfunction_header('x-rate-limit-remaining')) == 1:
                        print('Rate limit : ', twitter.get_lastfunction_header('x-rate-limit-remaining'))
                        print('Function paused ...')
                        
                        time.sleep(900)
                        
                        twitter = Twython(CONSUMER_KEY, CONSUMER_SECRET)
                    
                    for status in tweets['statuses']:
                        dict_['country'].append(get_country(status))
                        dict_['keyword'].append(keyword)
                        dict_['topic'].append(topic)
                        dict_['subtopic'].append(subtopic)
                        dict_['user'].append(status['user']['screen_name'])
                        dict_['date'].append(status['created_at'])
                        dict_['text'].append(status['text'])
                        dict_['favorite_count'].append(status['favorite_count'])
                        dict_['followers_count'].append(status['user']['followers_count'])
                        dict_['url'].append(get_url(status))
                        
                        if last_date > pd.to_datetime(status['created_at']).date():
                            
                            last_date = pd.to_datetime(status['created_at']).date()
                            

                    last_date = last_date - datetime.timedelta(days = 1)

    df = pd.DataFrame(dict_)
    df.sort_values(by='favorite_count', inplace=True, ascending=False)
    df.reset_index(inplace=True, drop=True)
    
    return df

In [19]:
df_geocode_topic = get_tweets_from_geocode_and_topics(twitter, keywords, topics)

Rate limit :  1
Function paused ...
Rate limit :  1
Function paused ...
Rate limit :  1
Function paused ...


In [20]:
df_geocode_topic

Unnamed: 0,country,keyword,topic,subtopic,user,date,text,favorite_count,followers_count,url
0,India,climate,Other Fuel Combustion,Fuel,LicypriyaK,Thu Feb 18 17:45:17 +0000 2021,India is doing great job in Solar Power despit...,493,126430,https://t.co/cPaVAhWc2m
1,India,climate,Land-Use Change and Forestry,Forest,ANI,Tue Feb 16 08:31:11 +0000 2021,Delhi: COP26 (26th UN Climate Change Conferenc...,329,5219539,https://t.co/QKJHrhTL2C
2,India,environment,Waste,Waste,Secretary_MoHUA,Sun Feb 14 08:27:35 +0000 2021,Unity opens the path to the Swachhatam Bharat!...,244,31661,https://t.co/OtrUEFY9KX
3,India,environment,Waste,Waste,AashrithaGundu,Fri Feb 12 09:42:39 +0000 2021,Distribution of 400 Ecofemme Reusable #Sanitar...,239,551,https://t.co/gO7meXCS6v
4,India,environment,Manufacturing/Construction,Chemicals,Pritesh7994,Thu Feb 18 09:03:45 +0000 2021,NOCIL enjoys monopoly business. Only two compa...,231,849,https://t.co/GqcnrsWOhW
...,...,...,...,...,...,...,...,...,...,...
5186,Unknow,climate,Electricity/Heat,Gas,mahejabeen25,Thu Feb 18 06:10:58 +0000 2021,RT @Ms_Aflatoon: The organizations working for...,0,236,no url
5187,Unknow,climate,Electricity/Heat,Gas,Lovescience15,Thu Feb 18 07:03:15 +0000 2021,RT @SowmyaRaj: By utilizing a combination of #...,0,488,no url
5188,India,climate,Electricity/Heat,Gas,JameelShaikhn,Thu Feb 18 07:48:48 +0000 2021,RT @Ms_Aflatoon: The organizations working for...,0,198,no url
5189,Unknow,climate,Electricity/Heat,Gas,phaniku36183927,Thu Feb 18 08:51:18 +0000 2021,RT @Ms_Aflatoon: The organizations working for...,0,15,no url


In [21]:
df_geocode_topic.drop_duplicates()

Unnamed: 0,country,keyword,topic,subtopic,user,date,text,favorite_count,followers_count,url
0,India,climate,Other Fuel Combustion,Fuel,LicypriyaK,Thu Feb 18 17:45:17 +0000 2021,India is doing great job in Solar Power despit...,493,126430,https://t.co/cPaVAhWc2m
1,India,climate,Land-Use Change and Forestry,Forest,ANI,Tue Feb 16 08:31:11 +0000 2021,Delhi: COP26 (26th UN Climate Change Conferenc...,329,5219539,https://t.co/QKJHrhTL2C
2,India,environment,Waste,Waste,Secretary_MoHUA,Sun Feb 14 08:27:35 +0000 2021,Unity opens the path to the Swachhatam Bharat!...,244,31661,https://t.co/OtrUEFY9KX
3,India,environment,Waste,Waste,AashrithaGundu,Fri Feb 12 09:42:39 +0000 2021,Distribution of 400 Ecofemme Reusable #Sanitar...,239,551,https://t.co/gO7meXCS6v
4,India,environment,Manufacturing/Construction,Chemicals,Pritesh7994,Thu Feb 18 09:03:45 +0000 2021,NOCIL enjoys monopoly business. Only two compa...,231,849,https://t.co/GqcnrsWOhW
...,...,...,...,...,...,...,...,...,...,...
5186,Unknow,climate,Electricity/Heat,Gas,mahejabeen25,Thu Feb 18 06:10:58 +0000 2021,RT @Ms_Aflatoon: The organizations working for...,0,236,no url
5187,Unknow,climate,Electricity/Heat,Gas,Lovescience15,Thu Feb 18 07:03:15 +0000 2021,RT @SowmyaRaj: By utilizing a combination of #...,0,488,no url
5188,India,climate,Electricity/Heat,Gas,JameelShaikhn,Thu Feb 18 07:48:48 +0000 2021,RT @Ms_Aflatoon: The organizations working for...,0,198,no url
5189,Unknow,climate,Electricity/Heat,Gas,phaniku36183927,Thu Feb 18 08:51:18 +0000 2021,RT @Ms_Aflatoon: The organizations working for...,0,15,no url


In [22]:
df_geocode_topic.topic.value_counts()

Electricity/Heat                998
Transportation                  754
Land-Use Change and Forestry    692
Agriculture                     653
Energy                          632
Manufacturing/Construction      427
Other Fuel Combustion           381
Waste                           347
Industrial Processes            230
Fugitive Emissions               77
Name: topic, dtype: int64

In [23]:
df_geocode_topic.to_csv("tweets_datasets.csv")

In [24]:
df_geocode_topic["keyword"].value_counts()

environment    2759
climate        2212
CO2             220
Name: keyword, dtype: int64