# Cleaning Part 1

---

Nate Bukowski

In [39]:
# imports
import pandas as pd
import numpy as np

from nltk.tokenize import RegexpTokenizer

In [40]:
master = pd.read_csv('./master_raw.csv')

In [41]:
master.head()

Unnamed: 0,DATE,V2Locations,V2Themes,V2Tone
0,20200101234500,"4#Dongxing, Jilin, China#CH#CH05#13176#43.0855...","BAN,762;MEDIA_MSM,219;WB_1921_PRIVATE_SECTOR_D...","2.27920227920228,3.7037037037037,1.42450142450..."
1,20200101234500,1#Chinese#CH#CH##35#105#CH#407;1#China#CH#CH##...,"TAX_FNCACT_EXECUTIVE_DIRECTOR,3581;WB_566_ENVI...","0.335570469798658,3.18791946308725,2.852348993..."
2,20200101234500,"4#Hanoi, Ha N?I, Vietnam, Republic Of#VM#VM44#...","TAX_FNCACT_CHILD,1184;TAX_FNCACT_CHILD,1769;TA...","-4.91978609625669,1.71122994652406,6.631016042..."
3,20200101234500,"4#Negueira, Galicia, Spain#SP#SP58#25787#43.13...","RURAL,2751;EPU_ECONOMY_HISTORIC,1930;AGRICULTU...","-1.87319884726225,1.44092219020173,3.314121037..."
4,20200101234500,"4#Peruibe, SãPaulo, Brazil#BR#BR27#11404#-24.3...","MOVEMENT_GENERAL,1613;CRISISLEX_T04_INFRASTRUC...","-4.44964871194379,1.40515222482436,5.854800936..."


In [42]:
df_list = [january, february, march, april, may, master]

for df in df_list:
    print(df.shape)

(265, 11)
(269, 11)
(238, 11)
(226, 11)
(102, 11)
(1330, 4)


In [43]:
master.isna().sum()

DATE             0
V2Locations    230
V2Themes         0
V2Tone           0
dtype: int64

In [44]:
# Function for reformatting the 'DATE' column

def reformat_date(df):
    
    # Create clean_date list of reformatted dates
    clean_date = [pd.to_datetime(str(date)[:8]) for date in df['DATE']]

    # Assign clean_date list to new 'date' column.
    df['date'] = clean_date
    
    return df

In [45]:
# Function that takes in the dataframe and creates the 'latitude' and 'longitude' columns necessary for mapping.
# The 'latitude' and 'longitude' columns contain the first set of coordinates listed in the 'V2Locations' column.

# 'V2Locations': Semicolon-delimited blocks, with pound symbol delimited fields.
# This is a list of all locations found in the text.

def lat_long(df):
    
    # Create an empty list to put the coordinates in.
    all_coordinates = []
    
    # Loop through the rows in 'V2Locations' and store all of the coordinates found in the row in a list.
    # Then append the list of row coordinates to the all_coordinates list.
    for locations in df['V2Locations']:
        row_coordinates = [location.split('#')[5:7] for location in locations.split(';')]
        all_coordinates.append(row_coordinates)       

    # Pull only the first set of coordinates from each row out of all_coordinates.
    first_coordinate_set = [coordinate[0] for coordinate in all_coordinates]

    # Create two new lists. One containing the latitudes and one containing the longitudes.
    latitude = [lat[0] for lat in first_coordinate_set]
    longitude = [long[1] for long in first_coordinate_set]

    # Create 'latitude' and 'longitude' columns.
    df['latitude'] = latitude
    df['longitude'] = longitude

    # Remove any whitespace.
    for num in df['latitude']:
        num.strip()

    for num in df['longitude']:
        num.strip()
    
    # Drop all columns with missing coordinate sets.
    df = df.loc[df['latitude'] != '']
    df = df.loc[df['longitude'] != '']
    
    
    # Cast both new columns as float dtypes.
    
    # I've tried many different ways of doing this, but none seem to work inside of the function...
    
    for lat in df['latitude']:
        try:
            float(lat)
        except ValueError:
            print(lat)
    
    for long in df['longitude']:
        try:
            float(long)
        except ValueError:
            print(long)
            
    df['latitude'] = df['latitude'].astype(float)
    df['longitude'] = df['longitude'].astype(float)

    return df

In [46]:
# Function for cleaning 'V2Themes' column.

# A complete list of possible themes can be found here: 
# https://blog.gdeltproject.org/new-august-2019-gkg-2-0-themes-lookup/

def clean_themes(df):
    
    # Create a list to store clean themes in.
    all_themes = []

    # Create a list for each row and add that list to all_themes.
    for themes in df['V2Themes']:
        row_themes = [theme.strip(',0123456789') for theme in themes.split(';')]
        all_themes.append(row_themes)

    # Assing all_themes to the new 'themes' column.
    df['themes'] = all_themes
    
    return df

In [47]:
# Function for cleaning the 'V2Tone' column. 

# The 'V2Tone' column is broken up into the following categories:
# Tone: This is the average “tone” of the document as a whole. 
# Positive Score: This is the percentage of all words in the article 
#                 that were found to have a positive emotional connotation.
# Negative Score: This is the percentage of all words in the article 
#                 that were found to have a positive emotional connotation. 
# Polarity: This is the percentage of words that had matches in the tonal dictionary 
#           as an indicator of how emotionally polarized or charged the text is.
# Activity Reference Density: This is the percentage of words that were active words offering 
#                             a very basic proxy of the overall “activeness” of the text compared 
#                             with a clinically descriptive text.
# Self/Group Reference Density: This is the percentage of all words in the article that are pronouns,
#                               capturing a combination of self-references and group-based discourse.
# Word Count: This is the total number of words in the document. 

# We will only keep Tone.

def clean_tone(df):

    # Create a list for the tone category.
    tone = [tones.split(',')[0]for tones in df['V2Tone']]
#     negative_score = [tones.split(',')[1]for tones in df['V2Tone']]
#     positive_score = [tones.split(',')[2]for tones in df['V2Tone']]
#     polarity = [tones.split(',')[3]for tones in df['V2Tone']]
#     activity_reference_density = [tones.split(',')[4]for tones in df['V2Tone']]
#     self_group_reference_density = [tones.split(',')[5]for tones in df['V2Tone']]
#     word_count = [tones.split(',')[6]for tones in df['V2Tone']]

    # Create the 'tone' column.
    df['tone'] = tone
#     df['negative_score'] = negative_score
#     df['positive_score'] = positive_score
#     df['polarity'] = polarity
#     df['activity_reference_density'] = activity_reference_density
#     df['self_group_reference_density'] = self_group_reference_density
#     df['word_count'] = word_count
    
    # Cast 'tone' as a float.
    df['tone'] = df['tone'].astype(float)
#     df['negative_score'] = df['negative_score'].astype(float)
#     df['positive_score'] = df['positive_score'].astype(float)
#     df['polarity'] = df['polarity'].astype(float)
#     df['activity_reference_density'] = df['activity_reference_density'].astype(float)
#     df['self_group_reference_density'] = df['self_group_reference_density'].astype(float)
#     df['word_count'] = df['word_count'].astype(float)
    
    return df

In [48]:
def clean_df(df):
    
    # Remove all NaNs
    df.dropna(inplace=True)
    
    # Refomat the 'DATE' column.
    reformat_date(df)
    
    # Create 'latitude' and 'longitude' columns.
    lat_long(df)  
    
    # Clean the 'V2Themes' column.
    clean_themes(df)
    
    # Clean the 'V2Tones' column.
    clean_tone(df)
    
    # Drop the old and unusable columns.
    df.drop(columns=['DATE', 'V2Locations', 'V2Themes', 'V2Tone'], inplace=True)
    
    return df

In [49]:
clean_master = clean_df(master)

In [50]:
clean_master.dtypes

date         datetime64[ns]
latitude             object
longitude            object
themes               object
tone                float64
dtype: object

In [51]:
# Cast 'latitude' column as floats.
for num in clean_master['latitude']:
    num.strip()

clean_master = clean_master.loc[clean_master['latitude'] != '']

for lat in clean_master['latitude']:
    try:
        float(lat)
    except ValueError:
        print(f'error: {lat}')

clean_master['latitude'] = clean_master['latitude'].astype(float)

# Cast 'longitude' column as floats.
for num in clean_master['longitude']:
    num.strip()

clean_master = clean_master.loc[clean_master['longitude'] != '']

for lat in clean_master['longitude']:
    try:
        float(lat)
    except ValueError:
        print(f'error: {lat}')

clean_master['longitude'] = clean_master['longitude'].astype(float)


In [52]:
clean_master.dtypes

date         datetime64[ns]
latitude            float64
longitude           float64
themes               object
tone                float64
dtype: object

In [53]:
clean_master.shape

(1100, 5)

In [54]:
clean_master.head()

Unnamed: 0,date,latitude,longitude,themes,tone
0,2020-01-01,43.0855,129.302,"[BAN, MEDIA_MSM, WB_1921_PRIVATE_SECTOR_DEVELO...",2.279202
1,2020-01-01,35.0,105.0,"[TAX_FNCACT_EXECUTIVE_DIRECTOR, WB_566_ENVIRON...",0.33557
2,2020-01-01,21.0333,105.85,"[TAX_FNCACT_CHILD, TAX_FNCACT_CHILD, TAX_FNCAC...",-4.919786
3,2020-01-01,43.1346,-6.89444,"[RURAL, EPU_ECONOMY_HISTORIC, AGRICULTURE, GEN...",-1.873199
4,2020-01-01,-24.3167,-47.0,"[MOVEMENT_GENERAL, CRISISLEX_T04_INFRASTRUCTUR...",-4.449649


In [55]:
clean_master.to_csv('./master_clean.csv', index=False)