Uploading raw datasets to S3 bucket.

In [2]:
import boto3
import pandas as pd
from sagemaker import get_execution_role
import tqdm

for i in tqdm.tqdm(range(0, 253)):
    obj = "spanish_tweets_2016_" + str(i) + ".json"
    obj_loc = "../../data/" + obj
    boto3.Session().resource('s3').Bucket("mt5599").Object("tweets/" + obj).upload_file(obj_loc)

100%|██████████| 253/253 [14:01<00:00,  3.33s/it]


In [None]:
import pandas as pd
import json
import re
import tqdm
from multiprocessing import Pool # multithreading
import gc

# write function to remove unnecessary columns
def keep_columns(df, # dataframe to be cleaned
                 columns): # list of columns to keep
    df = df[columns]
    return df

#write function to keep only tweets with location data
def has_loc(df):
    df = df[df.coordinates != "None"]
    df = df[df.place != "None"]
    df = df.reset_index(drop=True)
    return df


# helper function to extract place and coordinates
def safe_json_loads(string):
    try:
        string = json.loads(string)
    except:
        string = None
    return(string)


# extracting place components
def extract_place(df):
    
    df_coord = df
    
    df_coord['place_full_name'] = "None"
    df_coord['place_name'] = "None"
    df_coord['place_type'] = "None"
    df_coord['place_country'] = "None"
    df_coord['place_country_code'] = "None"

    for i in tqdm.tqdm(range(df_coord.shape[0])):

        if df_coord.place[i] != "None":
            try:
                split_by = "Place\(fullName='|', name='|', type='|', country='|', countryCode='|'\)"
                temp = re.split(split_by, df_coord.place[i])

                df_coord['place_full_name'][i] = temp[1]
                df_coord['place_name'][i] = temp[2]
                df_coord['place_type'][i] = temp[3]
                df_coord['place_country'][i] = temp[4]
                df_coord['place_country_code'][i] = temp[5]
            except:
                df_coord['place_full_name'][i] = None
                df_coord['place_name'][i] = None
                df_coord['place_type'][i] = None
                df_coord['place_country'][i] = None
                df_coord['place_country_code'][i] = None
                
    return df_coord


# extracting coordinates components
def extract_coordinates(df):
    
    df_coord = df
    
    df_coord['coordinates_longitude'] = "None"
    df_coord['coordinates_latitude'] = "None"

    for i in tqdm.tqdm(range(df_coord.shape[0])):

        if df_coord.coordinates[i] != "None":
            try:
                split_by = "Coordinates\(longitude=|, latitude=|\)"
                temp = re.split(split_by, df_coord.coordinates[i])

                df_coord['coordinates_longitude'][i] = temp[1]
                df_coord['coordinates_latitude'][i] = temp[2]

            except:
                df_coord['coordinates_longitude'][i] = None
                df_coord['coordinates_latitude'][i] = None
                
    return df_coord



def clean_df(filepath):
    
    print("working on ", filepath)
    
    print("reading in data")
    df = pd.read_json(filepath)

    print("removing unnecessary columns")
    df = keep_columns(df, ["id", "DateTime", "coordinates", "place", "username", "user_location"])

    print("filtering out tweets that have no location data")
    df = has_loc(df)

    print("extracting components of place")
    df = extract_place(df)

    print("extracting components of coordinates")
    df = extract_coordinates(df)
    
    print("saving")
    df.to_feather("s3://mt5599/tweets/spanish_tweets_2016_processed" + str(i) + ".feather")

    num = df.shape[0]
    
    print("done")
    
    del df
    gc.collect()
    
    return num


if __name__ == "__main__":
    
    filepaths = []
    for i in range(1, 253):
        filepath = "s3://mt5599/tweets/spanish_tweets_2016_" + str(i) + ".json"
        filepaths.append(filepath)
    
    total_tweets = 0
    for filepath in filepaths:
        tweet_no = clean_df(filepath)
        total_tweets = total_tweets + tweet_no
        print(total_tweets)

working on  s3://mt5599/tweets/spanish_tweets_2016_1.json
reading in data
removing unnecessary columns
filtering out tweets that have no location data


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

extracting components of place


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

extracting components of coordinates


100%|██████████| 347793/347793 [20:16<00:00, 285.99it/s]


saving
done
347793
working on  s3://mt5599/tweets/spanish_tweets_2016_2.json
reading in data
removing unnecessary columns
filtering out tweets that have no location data


  0%|          | 35/346203 [00:00<35:14, 163.71it/s]

extracting components of place


100%|██████████| 346203/346203 [1:06:18<00:00, 87.02it/s]
  0%|          | 86/346203 [00:00<13:35, 424.32it/s]

extracting components of coordinates


100%|██████████| 346203/346203 [19:43<00:00, 292.45it/s]


saving
done
693996
working on  s3://mt5599/tweets/spanish_tweets_2016_3.json
reading in data
removing unnecessary columns
filtering out tweets that have no location data


  0%|          | 31/395551 [00:00<45:38, 144.44it/s]

extracting components of place


100%|██████████| 395551/395551 [1:27:56<00:00, 74.96it/s]
  0%|          | 74/395551 [00:00<18:04, 364.79it/s]

extracting components of coordinates


100%|██████████| 395551/395551 [27:22<00:00, 240.79it/s]


saving
done
1089547
working on  s3://mt5599/tweets/spanish_tweets_2016_4.json
reading in data
removing unnecessary columns
filtering out tweets that have no location data


  0%|          | 37/307450 [00:00<29:00, 176.58it/s]

extracting components of place


100%|██████████| 307450/307450 [52:25<00:00, 97.76it/s] 
  0%|          | 96/307450 [00:00<10:51, 471.63it/s]

extracting components of coordinates


100%|██████████| 307450/307450 [15:27<00:00, 331.46it/s]


saving
done
1396997
working on  s3://mt5599/tweets/spanish_tweets_2016_5.json
reading in data
removing unnecessary columns
filtering out tweets that have no location data


  0%|          | 43/265931 [00:00<21:07, 209.72it/s]

extracting components of place


100%|██████████| 265931/265931 [38:44<00:00, 114.40it/s]
  0%|          | 110/265931 [00:00<08:10, 542.41it/s]

extracting components of coordinates


100%|██████████| 265931/265931 [10:48<00:00, 410.30it/s]


saving
done
1662928
working on  s3://mt5599/tweets/spanish_tweets_2016_6.json
reading in data
removing unnecessary columns
filtering out tweets that have no location data


  0%|          | 41/289829 [00:00<24:56, 193.70it/s]

extracting components of place


 43%|████▎     | 123343/289829 [13:00<23:21, 118.83it/s]

In [4]:
filepath = "s3://mt5599/tweets/spanish_tweets_2016_" + str(0) + ".json"

#print("starting to process ", filepath)
#df = pd.read_json(filepath)

print("removing unnecessary columns")
df = keep_columns(df, ["id", "DateTime", "coordinates", "place", "username", "user_location"])

print("keeping only tweets with location data")
df = has_loc(df)

print("extracting components of place")
df = extract_place(df)

print("extracting components of coodinates")
df = extract_coordinates(df)

df.to_feather("s3://mt5599/tweets/spanish_tweets_2016_processed_" + str(0) + ".feather")

print("done")
print()

removing unnecessary columns


NameError: name 'df' is not defined

Reading in datasets and combining them. Saving them as feather file in S3.

In [None]:
import boto3
import pandas as pd
from sagemaker import get_execution_role

role = get_execution_role()

files = []
DataFrameName = []
for i in range(0, 253):
    files.append("s3://mt5599/tweets/spanish_tweets_2016_" + str(i) + ".json")
    DataFrameName.append("spanish_tweets_2016_" + str(i))

dfs = {}

for name, file in zip(DataFrameName, files):
    dfs[name] = pd.read_json(file)
    print(name, " done")

spanish_tweets_2016_0  done
spanish_tweets_2016_1  done
spanish_tweets_2016_2  done
spanish_tweets_2016_3  done
spanish_tweets_2016_4  done
spanish_tweets_2016_5  done
spanish_tweets_2016_6  done
spanish_tweets_2016_7  done
spanish_tweets_2016_8  done
spanish_tweets_2016_9  done
spanish_tweets_2016_10  done
spanish_tweets_2016_11  done
spanish_tweets_2016_12  done
spanish_tweets_2016_13  done
spanish_tweets_2016_14  done
spanish_tweets_2016_15  done
spanish_tweets_2016_16  done
spanish_tweets_2016_17  done
spanish_tweets_2016_18  done
spanish_tweets_2016_19  done
spanish_tweets_2016_20  done
spanish_tweets_2016_21  done
spanish_tweets_2016_22  done
spanish_tweets_2016_23  done
spanish_tweets_2016_24  done
spanish_tweets_2016_25  done
spanish_tweets_2016_26  done
spanish_tweets_2016_27  done
spanish_tweets_2016_28  done
spanish_tweets_2016_29  done
spanish_tweets_2016_30  done
spanish_tweets_2016_31  done
spanish_tweets_2016_32  done
spanish_tweets_2016_33  done
spanish_tweets_2016_34  

In [None]:
df = pd.concat(dfs.values(), ignore_index=True)
df

In [None]:
import feather

df.to_feather("../../data/df_2016.feather")
boto3.Session().resource('s3').Bucket("mt5599").Object("tweets/df_2016.feather").upload_file("../../data/df_2016.feather")