In this notebook I'll try to extend the date range of the most recently retrieved data using older data that I collected while building the code, a few days before the final version of [notebooks/01-collecting_and_saving_tweets.ipynb](http://localhost:8888/notebooks/twitter_analysis_online_grocery_NL/notebooks/01-collecting_and_saving_tweets.ipynb).

# Load Packages

In [1]:
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import time

# Current local date as an ISO string (YYYY-MM-DD); used below to stamp output file names.
TodaysDate = datetime.date.today().isoformat()

# Trying to add more days to Jumbo and AH

In [2]:
def search_file_in_folder(folder, str_file, type_file='csv'):
    """Find the files in a folder whose names contain a given piece of text.

    Parameters
    ----------
    folder : str
        Folder prefix. It is concatenated directly with the glob pattern, so
        it should end with a path separator (e.g. ``"../data/tweets/"``).
    str_file : str
        Text that must appear somewhere in the file name.
    type_file : str, default 'csv'
        File extension to match, without the leading dot.

    Returns
    -------
    list of str or str
        The matching paths in the order ``glob.glob`` yields them, or the
        message ``'No files containing <str_file>'`` when nothing matches.
        Callers that iterate over the result should check for the string
        case first.
    """
    # glob.glob already returns a list of paths; no per-item copying or
    # exception handling is needed.
    list_files_paths = glob.glob(folder + '*' + str_file + '*.' + type_file)

    if list_files_paths:
        return list_files_paths
    return 'No files containing {}'.format(str_file)

In [3]:
def create_dataframe_info(result):
    """Summarise a collection of tweet CSV files, one row per file.

    Parameters
    ----------
    result : list of str or str
        Paths of the CSV files to summarise. Each file must have a
        ``created_at`` column. A single path given as a plain string is
        treated as a one-element list.

    Returns
    -------
    pandas.DataFrame
        Columns: ``file_path``, ``min_created_list`` and
        ``max_created_list`` (earliest and latest ``created_at`` in the
        file), ``n_tweet_list`` (number of rows) and ``n_columns``
        (number of columns).
    """
    # A bare string would otherwise be iterated character by character and
    # fail on the first letter with a confusing "file not found" error.
    if isinstance(result, str):
        result = [result]

    columns = ['file_path', 'min_created_list', 'max_created_list',
               'n_tweet_list', 'n_columns']

    rows = []
    for file in result:
        df = pd.read_csv(file)
        created_at = pd.to_datetime(df['created_at'])
        n_rows, n_cols = df.shape
        rows.append({
            'file_path': file,
            # Series.min()/max() skip NaT and return NaT for an empty file;
            # the builtins give order-dependent results with NaT and raise
            # on an empty sequence.
            'min_created_list': created_at.min(),
            'max_created_list': created_at.max(),
            'n_tweet_list': n_rows,
            'n_columns': n_cols,
        })

    # Passing the column list keeps the layout stable even with no files.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# List every saved CSV in the tweets folder whose name contains 'JumboSupermarkt'.
folder = "../data/tweets/"
result = search_file_in_folder(folder, 'JumboSupermarkt')
result

In [None]:
# One summary row per file, ordered by earliest tweet date and then by number of tweets.
df_Jumbo = create_dataframe_info(result).sort_values(by=['min_created_list','n_tweet_list'])

In [None]:
df_Jumbo.info()

If I concatenate the most recent file with any of the first 3 files listed in the dataframe, I can go back as far as 3rd March. One trade-off is that the latest version has 5 more columns than the old one. Let's see what we can do about AH.

In [None]:
folder = "../data/tweets/"
result = search_file_in_folder(folder, 'albertheijn')
result

In [None]:
df_info_AH.info()

In [None]:
df_info_AH = create_dataframe_info(result).sort_values(by=['min_created_list','n_tweet_list'])
df_info_AH

By adding one of the first 3 files we can go back to 30th March.

So let's check which columns we need to drop, and then concatenate the newer and older CSV files to extend our range so that it runs from 30th March 2020 until 22nd June 2020.

In [None]:
df_info_AH.loc[0,'file_path']

In [None]:
df_AH_2020_06_16 = pd.read_csv(df_info_AH.loc[0,'file_path'])
# before checking for difference in the columns between old and new data I'll rename handle to screen_name since both are the same
df_AH_2020_06_16.rename(columns={'handle':'screen_name'},inplace=True)
df_AH_2020_06_16.head()

In [None]:
df_AH_2020_06_16.info()

In [None]:
sum(df_AH_2020_06_16['created_at'].str.contains('2020-03'))

In [None]:
df_AH_2020_06_16['created_at'] = pd.to_datetime(df_AH_2020_06_16['created_at'], infer_datetime_format=True)

In [None]:
min(df_AH_2020_06_16['created_at']),max(df_AH_2020_06_16['created_at'])

In [None]:
df_AH_2020_06_22 = pd.read_csv(df_info_AH.loc[10,'file_path'])
df_AH_2020_06_22.head()

In [None]:
df_AH_2020_06_22.info()

In [None]:
df_AH_2020_06_22['created_at'] = pd.to_datetime(df_AH_2020_06_22['created_at'], infer_datetime_format=True)

In [None]:
df_AH_2020_06_22.info()

In [None]:
common_columns = list(set(df_AH_2020_06_16.columns).intersection(set(df_AH_2020_06_22.columns)))
common_columns.sort()
common_columns

In [None]:
# this agrees with what we expected
len(common_columns)

The columns we will lose by concatenating the older and newer data are:

In [None]:
# Columns that exist only in the newer data.
list(set(df_AH_2020_06_22.columns) - set(df_AH_2020_06_16.columns))

I'd like to explore this data as well as use it to compare all 3 (online) supermarkets.

We need to deal with `language` because it will be important for the sentiment analysis. For now, I'll add this column to the older data filled with NaN, and then I'll try to label it.

In [None]:
# Adding 'language' column to df_AH_2020_06_16
# Every row gets NaN as a placeholder, so the older data has a 'language' column to line up with.

df_AH_2020_06_16['language'] = np.nan

In [None]:
df_AH_2020_06_16.head()

In [None]:
# update common_columns

common_columns = list(set(df_AH_2020_06_16.columns).intersection(set(df_AH_2020_06_22.columns)))
common_columns.sort()
common_columns

In [None]:
len(common_columns)

In [None]:
list(set(df_AH_2020_06_22.columns).difference(set(df_AH_2020_06_16.columns)))

Now we have 15 columns, and we can verify that `language` is also present in the older dataframe. Let's concatenate both dataframes and try to deal with the `language` problem.

In [None]:
df_AH_2020_06_16.info(null_counts=True)

In [None]:
df_AH_2020_06_16 = df_AH_2020_06_16[df_AH_2020_06_16['created_at'] <= min(df_AH_2020_06_22['created_at'])]

In [None]:
df_AH_2020_06_16.info()

In [None]:
min(df_AH_2020_06_16['created_at']),max(df_AH_2020_06_16['created_at'])

In [None]:
min(df_AH_2020_06_22['created_at'])

In [None]:
# concatenate dataframes
df_AH_concat = pd.concat([df_AH_2020_06_16,df_AH_2020_06_22[common_columns]])

In [None]:
# eliminate duplicates based on create_at and text, keep will be setted to 'last' since we know that in the older 
# data language will be nan and it is better to keep data that is not nan

df_AH_concat = df_AH_concat.loc[df_AH_concat.astype(str).drop_duplicates(subset=['created_at','tweet_id','text']).index]

# sorting by 'created_at'
df_AH_concat.sort_values(by='created_at',inplace = True)

# reset index
df_AH_concat.reset_index(drop = True, inplace = True)

# save in csv

df_AH_concat.to_csv("../data/processed/AH_concat_16_and_22_June_"+TodaysDate+".csv", index = False)

In [None]:
df_test = pd.read_csv("../data/processed/AH_concat_16_and_22_June_"+TodaysDate+".csv")

In [None]:
df_test.info(null_counts=True)

In [None]:
df_test.head()

In [None]:
df_test.tail()

# Inserting missing language data