This notebook show how to generate a pandas dataframe from several JSON files with tweets captures using the Stream API. 
Particularly, two dataframes are generated:
* A dataframe with tweet and user info
* A dataframe with location info of the tweets

# Configuration

* Connection with Master notebook is configures
* DIfferent variables are set

In [1]:
# check if IS_MASTER exists, this variable will only exist if it's being called by MASTER notebook.
# if it does not exist, set it to False
try: IS_MASTER
except: IS_MASTER = False
# The code below will only run if it's NOT being called from MASTER notebook
if not IS_MASTER:
    # targetFiles = 'HealthyFood'
    # targetFiles = 'UnhealthyFood'
    targetFiles = 'GenericTweets'
    DATA_DIR = './data/temp/' # 
#The code expects a folder (targetFiles) with the next tree structure inside:
# - Name of captured dataset (JSON files)
# |- Year
#  |- Month
#   |- Day
#    |- JSON files
RAW_JSON_DIR = f'./data/JSONFiles/{targetFiles}' #Path to the root folder of JSON files containing tweets
RAW_CSV_DIR = f'./data/CSVFiles/'
PROCESSED_CSV_FILE = f'./{RAW_CSV_DIR}/{targetFiles}-DataFrame.csv'
PROCESSED_LOCATION_CSV_FILE = f'./{RAW_CSV_DIR}/{targetFiles}-LocationsDataframe.csv' # use "small data" here

In [2]:
#Libraries used in the notebook
import csv
import json
import numpy as np
import pandas as pd
import glob
import os
import ntpath
import pickle
import shutil


# 1: Transform the json files to csv

In [3]:
#We get a list with all target JSON files

files = glob.glob(f'{RAW_JSON_DIR}/**/*.json', recursive=True)
files.sort()
filenames = []
for file in files:
    filename=os.path.splitext(file)[0]
    filename = ntpath.basename(filename)
    filenames.append(filename)

filenames

['2019_11_13_13_36_41-Copy1',
 '2019_11_13_14_36_42-Copy1',
 '2019_11_13_15_36_43-Copy1',
 '2019_11_13_16_36_43-Copy1',
 '2019_11_13_17_36_43-Copy1',
 '2019_11_13_18_36_44-Copy1']

In [4]:
# Convert JSON to CSV files
# It may take some time
i = 0
NEW_FILES = 0
outdir = RAW_CSV_DIR + 'temp/'
if not os.path.exists(outdir):
    os.mkdir(outdir)
failed_files = []                              #JSON files in this list need a ']' at the end due to interrupts during the stream capture

for file in files:
    outname = '{}'.format(filenames[i])+'.csv'
    fullname = os.path.join(outdir, outname)
    i+=1

    if not os.path.exists(fullname):
        print('Creating {}'.format(fullname))
        with open(file, encoding='utf-8-sig') as f_input:
            try:
                df = pd.read_json(f_input)
            except:
                failed_files.append(f_input)
            
        df.to_csv(fullname , index=False)
        NEW_FILES +=1

print('{} new csv files has been created'.format(NEW_FILES))

Creating ./data/CSVFiles/temp/2019_11_13_13_36_41-Copy1.csv
Creating ./data/CSVFiles/temp/2019_11_13_14_36_42-Copy1.csv
Creating ./data/CSVFiles/temp/2019_11_13_15_36_43-Copy1.csv
Creating ./data/CSVFiles/temp/2019_11_13_16_36_43-Copy1.csv
Creating ./data/CSVFiles/temp/2019_11_13_17_36_43-Copy1.csv
Creating ./data/CSVFiles/temp/2019_11_13_18_36_44-Copy1.csv
6 new csv files has been created


In [5]:
failed_files

[]

# 2: Create a dataframe

In [21]:
#files = glob.glob(f'{RAW_CSV_DIR}temp/*.csv')
files = glob.glob(f'{RAW_CSV_DIR}/smallData-DataFrame.csv')
files.sort()
files

['./data/CSVFiles//smallData-DataFrame.csv']

In [24]:
# Join all csv files in one dataframe in case there are new files
#TODO:It could have less computational cost if we only join new csv files

if NEW_FILES != 0:

    df = pd.DataFrame()
    print(df.shape)

    li = []

    for filename in files:
        df_temp = pd.read_csv(filename, index_col=None, header=0, lineterminator='\n')
        df_temp = df_temp[['coordinates','created_at','geo','id','id_str','lang','place',
                           'text','user', 'extended_tweet', 'truncated', 'extended_entities']]
        li.append(df_temp)

    df = pd.concat(li, axis=0, ignore_index=True)
    print(df.shape)

    #shutil.rmtree(outdir)
    
    #Filter RTs

    df = df[df['text'].str.contains("RT @") == False]
    df = df.reset_index()
    
    print(df.shape)

(0, 0)
(13525, 12)
(13525, 13)


In [25]:
#Get full text from extended tweet
def getFullText(truncated, extended_tweet):
    if truncated:
        a = extended_tweet.replace("'", '"')
        try:
            b = json.loads(a)
            res = b['full_text']
        except:
            res = None
        return res

In [26]:
if NEW_FILES != 0:
    df['full_text'] = df.apply(lambda x: getFullText(x['truncated'], x['extended_tweet']) , axis=1)

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13525 entries, 0 to 13524
Data columns (total 14 columns):
index                13525 non-null int64
coordinates          50 non-null object
created_at           13525 non-null object
geo                  50 non-null object
id                   13525 non-null float64
id_str               13525 non-null float64
lang                 13525 non-null object
place                525 non-null object
text                 13525 non-null object
user                 13525 non-null object
extended_tweet       3312 non-null object
truncated            13525 non-null float64
extended_entities    653 non-null object
full_text            2780 non-null object
dtypes: float64(3), int64(1), object(10)
memory usage: 1.4+ MB


In [28]:
def getText2Proc(fullText, text):
    if fullText is not None:
        return (fullText)
    else:
        return (text)

In [29]:
df['text2Proc'] = df.apply(lambda x: getText2Proc(x['full_text'], x['text']) , axis=1)
df.head()


Unnamed: 0,index,coordinates,created_at,geo,id,id_str,lang,place,text,user,extended_tweet,truncated,extended_entities,full_text,text2Proc
0,0,,2019-11-07 15:51:39.000,,1.19247e+18,1.19247e+18,es,,Nunca voy a entender por qué #Bienvenidos13 si...,"{'id': 284654792, 'id_str': '284654792', 'name...",{'full_text': 'Nunca voy a entender por qué #B...,1.0,,Nunca voy a entender por qué #Bienvenidos13 si...,Nunca voy a entender por qué #Bienvenidos13 si...
1,1,,2019-11-07 15:52:52.000,,1.19247e+18,1.19247e+18,es,,@MalenaAubone NooOOoOOo man voy a comprar hela...,"{'id': 3029655005, 'id_str': '3029655005', 'na...",,0.0,,,@MalenaAubone NooOOoOOo man voy a comprar hela...
2,2,,2019-11-07 15:53:49.000,,1.19247e+18,1.19247e+18,es,,"Estamos en una parri con mi viejo, al costado ...","{'id': 602970512, 'id_str': '602970512', 'name...",{'full_text': 'Estamos en una parri con mi vie...,1.0,,"Estamos en una parri con mi viejo, al costado ...","Estamos en una parri con mi viejo, al costado ..."
3,3,,2019-11-07 15:54:03.000,,1.19247e+18,1.19247e+18,es,,No nos extinguimos de pedo,"{'id': 1726824211, 'id_str': '1726824211', 'na...",,0.0,,,No nos extinguimos de pedo
4,4,,2019-11-07 15:55:01.000,,1.192471e+18,1.192471e+18,es,,Desde las 11am. Que pienso en el helado que es...,"{'id': 185476629, 'id_str': '185476629', 'name...",,0.0,,,Desde las 11am. Que pienso en el helado que es...


In [30]:
outdir = RAW_CSV_DIR
if not os.path.exists(outdir):
    os.mkdir(outdir)
df.to_csv(PROCESSED_CSV_FILE, index=False)

# 3: Get tweet location

In [14]:
# Get user location
users = df.user.values
locations = []

if NEW_FILES != 0:
    for user in users:
        keyword = "'location':"
        before_keyword, keyword, after_keyword = user.partition(keyword)
        my_string = after_keyword
        keyword = "'url':"
        before_keyword, keyword, after_keyword = my_string.partition(keyword)
        user_location = before_keyword.replace(',','').replace("\'",'').replace('\"','').strip()
        locations.append(user_location)
    df = df.assign(user_location = locations)
    df = df.replace('None', np.NaN)

In [15]:
# Get tweet location

if NEW_FILES != 0:
    place = df.place

    longitude = []
    latitude = []
    country = []
    city = []
    n=0
    for i in place.values:
        a = ''
        if not isinstance(i, float):
            a = i.replace("'", '"')

        try:
            b = json.loads(a)
            n = n + 1
            longitude.append(b['bounding_box']['coordinates'][0][0][0])
            latitude.append(b['bounding_box']['coordinates'][0][0][1])
            country.append(b['country'])
            city.append(b['name'])
        except ValueError:
            longitude.append(np.NaN)
            latitude.append(np.NaN)
            country.append(np.NaN)
            city.append(np.NaN)
            continue
    df['Country'] = country
    df['City'] = city
    df['Longitude'] = longitude
    df['Latitude'] = latitude
    df['Coordinates'] = list(zip(df.Longitude, df.Latitude))

    df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14703 entries, 0 to 14702
Data columns (total 21 columns):
index                14703 non-null int64
coordinates          75 non-null object
created_at           14703 non-null object
geo                  75 non-null object
id                   14703 non-null int64
id_str               14703 non-null int64
lang                 14703 non-null object
place                600 non-null object
text                 14703 non-null object
user                 14703 non-null object
extended_tweet       5959 non-null object
truncated            14703 non-null bool
extended_entities    595 non-null object
full_text            4801 non-null object
text2Proc            14703 non-null object
user_location        10331 non-null object
Country              598 non-null object
City                 598 non-null object
Longitude            598 non-null float64
Latitude             598 non-null float64
Coordinates          14703 non-null object
dtypes: boo

In [16]:
df_original = df


### Create a dataframe with locations for statistical analysis

In [17]:
df = df[['Country', 'City', 'Longitude', 'Latitude']][df['Country'].notna()]
df['Coordinates'] = list(zip(df.Longitude, df.Latitude))
df


Unnamed: 0,Country,City,Longitude,Latitude,Coordinates
30,España,Palomeque,-4.017543,40.095033,"(-4.017543, 40.095033)"
38,España,Valencia,-0.432545,39.278381,"(-0.43254499999999996, 39.278381)"
49,España,Oviedo,-6.020100,43.278867,"(-6.0201, 43.278867)"
68,Peru,Lince,-77.045913,-12.093449,"(-77.045913, -12.093449)"
80,Argentina,Villa Soldati,-58.502543,-34.705421,"(-58.502543, -34.705421)"
117,México,Guadalupe,-100.274823,25.621688,"(-100.274823, 25.621688)"
148,España,Lloret de Mar,2.775933,41.686311,"(2.775933, 41.686311)"
157,Venezuela,Mérida,-71.195747,8.566088,"(-71.195747, 8.566088)"
178,Argentina,Ciudad Autónoma de Buenos Aires,-58.531792,-34.674453,"(-58.531792, -34.674453)"
214,Peru,La Molina,-76.972519,-12.124810,"(-76.972519, -12.12481)"


In [18]:
df.to_csv(PROCESSED_LOCATION_CSV_FILE, index=False)

In [19]:
#get tweets from Spain
df = df_original
df.loc[df['Country'] == "España", 'Country'] = 'Spain'
df.loc[df['Country'] == "Espanya", 'Country'] = 'Spain'
df_spain = df = df[df.Country == 'Spain']
df_spain = df_spain.reset_index()
df_spain

Unnamed: 0,level_0,index,coordinates,created_at,geo,id,id_str,lang,place,text,...,truncated,extended_entities,full_text,text2Proc,user_location,Country,City,Longitude,Latitude,Coordinates
0,30,87,,2019-11-13 13:37:28.000,,1194610231965364224,1194610231965364224,es,"{'id': '369ff584feaf6201', 'url': 'https://api...",La veleta naranja se rompió de la tanto giro.\...,...,False,,,La veleta naranja se rompió de la tanto giro.\...,,Spain,Palomeque,-4.017543,40.095033,"(-4.017543, 40.095033)"
1,38,105,,2019-11-13 13:37:42.000,,1194610288701714433,1194610288701714432,es,"{'id': '071a52d3a927a1b8', 'url': 'https://api...",Toma Geroma! Pastillas de goma! Dabuten tronco...,...,False,,,Toma Geroma! Pastillas de goma! Dabuten tronco...,València,Spain,Valencia,-0.432545,39.278381,"(-0.43254499999999996, 39.278381)"
2,49,136,,2019-11-13 13:37:59.000,,1194610359233122306,1194610359233122304,es,"{'id': 'd1272bba8714df3d', 'url': 'https://api...",@nancyacostatfe @Hibai_ @MabelFigueruelo De fr...,...,False,,,@nancyacostatfe @Hibai_ @MabelFigueruelo De fr...,,Spain,Oviedo,-6.020100,43.278867,"(-6.0201, 43.278867)"
3,148,506,,2019-11-13 13:41:35.000,,1194611265802518528,1194611265802518528,es,"{'id': '16d7e4bdce732b26', 'url': 'https://api...",Parece que mermelada se cree el único ser cult...,...,False,,,Parece que mermelada se cree el único ser cult...,Gerona Cataluña,Spain,Lloret de Mar,2.775933,41.686311,"(2.775933, 41.686311)"
4,232,782,"{'type': 'Point', 'coordinates': [-3.69051871,...",2019-11-13 13:44:14.000,"{'type': 'Point', 'coordinates': [40.42790112,...",1194611934433165312,1194611934433165312,es,"{'id': '206c436ce43a43a3', 'url': 'https://api...",Edamame en bartomate \n.\n.\n.\n#edamame #bart...,...,True,,Edamame en bartomate \n.\n.\n.\n#edamame #bart...,Edamame en bartomate \n.\n.\n.\n#edamame #bart...,Madrid,Spain,Madrid,-3.889005,40.312071,"(-3.889005, 40.312071)"
5,290,953,,2019-11-13 13:45:53.000,,1194612350529216512,1194612350529216512,es,"{'id': '8c86b8b4cb716103', 'url': 'https://api...",información inutil sobre mi:\n\naltura: 1'73\n...,...,True,,,información inutil sobre mi:\n\naltura: 1'73\n...,La Línea Cádiz,Spain,Sevilla,-6.028430,37.313613,"(-6.02843, 37.313613)"
6,291,958,,2019-11-13 13:45:57.000,,1194612367033880577,1194612367033880576,es,"{'id': '00ab08eef1b62e92', 'url': 'https://api...",Hoy está el día para un buen plato de lentejas...,...,False,,,Hoy está el día para un buen plato de lentejas...,Las Palmas de Gran Canaria,Spain,Las Palmas de Gran Canaria,-15.525504,28.024813,"(-15.525504, 28.024813)"
7,686,2256,,2019-11-13 13:57:50.000,,1194615358046507008,1194615358046507008,es,"{'id': '56933d34eb0f964a', 'url': 'https://api...",Este sabroso crumble de calabaza y avellanas e...,...,True,,Este sabroso crumble de calabaza y avellanas e...,Este sabroso crumble de calabaza y avellanas e...,Ángel 40 Albacete,Spain,Albacete,-2.181299,38.672083,"(-2.181299, 38.672083)"
8,944,3006,,2019-11-13 14:04:50.000,,1194617117833863168,1194617117833863168,es,"{'id': 'f4250a690d3b8039', 'url': 'https://api...","informacion super inutil de mi:\n\naltura: 1,5...",...,True,,"informacion super inutil de mi:\n\naltura: 1,5...","informacion super inutil de mi:\n\naltura: 1,5...",Euskal Herria,Spain,Zarautz,-2.209988,43.242870,"(-2.209988, 43.24287)"
9,1250,3896,"{'type': 'Point', 'coordinates': [-3.68333, 40...",2019-11-13 14:13:06.000,"{'type': 'Point', 'coordinates': [40.4, -3.683...",1194619197227196418,1194619197227196416,es,"{'id': '206c436ce43a43a3', 'url': 'https://api...",¡¡Que hambre!! ¿Y tú que comes hoy?? Yo unas ...,...,True,,¡¡Que hambre!! ¿Y tú que comes hoy?? Yo unas ...,¡¡Que hambre!! ¿Y tú que comes hoy?? Yo unas ...,Madrid,Spain,Madrid,-3.889005,40.312071,"(-3.889005, 40.312071)"


In [20]:
# Tweets with extended entities (multimedia) from Spain
df_spain[df_spain['extended_entities'].notna()]

Unnamed: 0,level_0,index,coordinates,created_at,geo,id,id_str,lang,place,text,...,truncated,extended_entities,full_text,text2Proc,user_location,Country,City,Longitude,Latitude,Coordinates
18,1837,5733,,2019-11-13 14:29:31.000,,1194623329220595712,1194623329220595712,es,"{'id': '206c436ce43a43a3', 'url': 'https://api...",Peludito de 2 escasos años q está perdiendo vi...,...,False,"{'media': [{'id': 1194623305468104704, 'id_str...",,Peludito de 2 escasos años q está perdiendo vi...,Madrid España,Spain,Madrid,-3.889005,40.312071,"(-3.889005, 40.312071)"
31,3205,10096,,2019-11-13 15:07:43.000,,1194632943685095424,1194632943685095424,es,"{'id': '342f9faf7392d4ce', 'url': 'https://api...",Las lentejas y la adjika casera (más verde que...,...,False,"{'media': [{'id': 1194632930758279169, 'id_str...",,Las lentejas y la adjika casera (más verde que...,🇪🇸 | 🇺🇦 | 🇷🇺,Spain,Logroño,-2.542232,42.429049,"(-2.542232, 42.429049)"
38,4049,12747,,2019-11-13 15:30:20.000,,1194638635213025282,1194638635213025280,es,"{'id': 'a4d75ab3c00c2563', 'url': 'https://api...",Celebremos el Día Mundial de la Dieta Mediterr...,...,False,"{'media': [{'id': 1194638616439480320, 'id_str...",,Celebremos el Día Mundial de la Dieta Mediterr...,Níjar (Almería),Spain,Níjar,-2.300484,36.719095,"(-2.300484, 36.719095)"
94,12979,38959,,2019-11-13 18:54:13.000,,1194689942691209217,1194689942691209216,es,"{'id': '0df6ac4d361d4e0e', 'url': 'https://api...",fruta https://t.co/RPPLyt2mSP,...,False,"{'media': [{'id': 1194689936139718661, 'id_str...",,fruta https://t.co/RPPLyt2mSP,,Spain,Vigo,-8.916355,42.12938,"(-8.916355, 42.12938)"
99,13169,39501,,2019-11-13 18:58:32.000,,1194691029565419520,1194691029565419520,es,"{'id': '206c436ce43a43a3', 'url': 'https://api...",@Pol588 @NebraskaGuy212 Te gusta el pimiento? ...,...,False,"{'media': [{'id': 1194691021571002374, 'id_str...",,@Pol588 @NebraskaGuy212 Te gusta el pimiento? ...,Madrid Comunidad de Madrid,Spain,Madrid,-3.889005,40.312071,"(-3.889005, 40.312071)"
102,13837,41343,,2019-11-13 19:14:46.000,,1194695117359087616,1194695117359087616,es,"{'id': '71d49086a15d032e', 'url': 'https://api...","Jamoncitos de pollo con tomate, la receta que ...",...,False,"{'media': [{'id': 1194695111118077952, 'id_str...",,"Jamoncitos de pollo con tomate, la receta que ...",Madrid,Spain,Sevilla la Nueva,-4.076755,40.33229,"(-4.076755, 40.33229)"
