In [3]:
# Do all imports and installs here - Done
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
import pandas as pd
import re
import configparser
import os

In [6]:
# Production input — swap in for the sample path below when running for real:
# worldtempe_dataset = '../../data2/GlobalLandTemperaturesByCity.csv'

# Sample file used while developing (presumably one part of the full CSV — confirm).
worldtempe_dataset = 'GlobalLandTemperaturesByCity_part9.csv'

# Comma is read_csv's default delimiter, so it is not passed explicitly.
worldtempe_df = pd.read_csv(worldtempe_dataset)
worldtempe_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   dt                             100000 non-null  object 
 1   AverageTemperature             96552 non-null   float64
 2   AverageTemperatureUncertainty  96552 non-null   float64
 3   City                           100000 non-null  object 
 4   Country                        100000 non-null  object 
 5   Latitude                       100000 non-null  object 
 6   Longitude                      100000 non-null  object 
dtypes: float64(2), object(5)
memory usage: 5.3+ MB


In [7]:
# Preview the first rows of the loaded frame.
worldtempe_df.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E


In [10]:
# Number of rows per country, listed from the least to the most frequent.
worldtempe_df.loc[:, 'Country'].value_counts().sort_values(ascending=True)

Papua New Guinea               1581
Oman                           1653
Eritrea                        1797
Djibouti                       1797
Lesotho                        1881
Swaziland                      1881
Botswana                       1881
Namibia                        1881
Congo                          1893
Central African Republic       1893
Costa Rica                     1953
Rwanda                         1965
Burundi                        1965
Guinea Bissau                  1977
Liberia                        1977
Mauritania                     1977
Qatar                          2049
Bahrain                        2049
Hong Kong                      2082
South Korea                    2097
Singapore                      2265
Cambodia                       2265
Guyana                         2277
Suriname                       2277
Mongolia                       2318
Laos                           2371
Azerbaijan                     2460
Jordan                      

In [6]:
# Row/column count of the frame.
# NOTE(review): the recorded (8599212, 7) does not match the 100000-row sample loaded
# above — presumably captured from a run against the production CSV; confirm by
# re-running the notebook top to bottom.
worldtempe_df.shape

(8599212, 7)

In [13]:
# Number of rows per country, smallest count first.
# NOTE(review): the recorded output lists only "United States", so this cell was
# evidently executed after the country filter that appears below it (In [13] vs In [11]).
# On a fresh top-to-bottom run it would list every country.
worldtempe_df['Country'].value_counts().sort_values()

United States    687289
Name: Country, dtype: int64

### Cleaning Country

Filter **'Country'** down to the single value "United States" and check the dataframe size

In [11]:
# Keep only the United States rows. The explicit .copy() makes the result an
# independent frame, so adding columns to it afterwards does not raise
# SettingWithCopyWarning (same pattern as the date filter later in this notebook).
worldtempe_df = worldtempe_df[worldtempe_df['Country'] == 'United States'].copy()

In [12]:
# Row/column count after restricting the frame to United States rows.
worldtempe_df.shape

(687289, 7)

### Cleaning columns with datetime data type

In [14]:
# Parse the text column 'dt' into real timestamps, stored alongside the original
# string column as 'dt_converted'.
worldtempe_df['dt_converted'] = pd.to_datetime(worldtempe_df['dt'])

In [37]:
# Inspect dtypes and non-null counts now that dt_converted exists.
# NOTE(review): the recorded output (165508 rows, In [37]) matches the frame after the
# 1960 date filter further down, not the state at this point in the notebook.
worldtempe_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 165508 entries, 49236 to 8439246
Data columns (total 8 columns):
dt                               165508 non-null object
AverageTemperature               165507 non-null float64
AverageTemperatureUncertainty    165507 non-null float64
City                             165508 non-null object
Country                          165508 non-null object
Latitude                         165508 non-null object
Longitude                        165508 non-null object
dt_converted                     165508 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(2), object(5)
memory usage: 11.4+ MB


In [21]:
# Keep observations dated strictly after 1960-01-01 (the boundary date itself is
# excluded), as an independent copy of the selection.
worldtempe_df = worldtempe_df.loc[worldtempe_df['dt_converted'] > "1960-01-01"].copy()

In [22]:
# Row/column count after the date filter.
worldtempe_df.shape

(165508, 8)

In [23]:
# Latest observation date remaining in the frame.
worldtempe_df['dt_converted'].max()

Timestamp('2013-09-01 00:00:00')

In [24]:
# Earliest observation date remaining in the frame.
worldtempe_df['dt_converted'].min()

Timestamp('1960-02-01 00:00:00')

In [27]:
# Count missing values in every column.
worldtempe_df.isna().sum()

dt                               0
AverageTemperature               1
AverageTemperatureUncertainty    1
City                             0
Country                          0
Latitude                         0
Longitude                        0
dt_converted                     0
dtype: int64

In [28]:
# Show the rows that have no AverageTemperature reading.
worldtempe_df.loc[worldtempe_df['AverageTemperature'].isna()]

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,dt_converted
287781,2013-09-01,,,Anchorage,United States,61.88N,151.13W,2013-09-01


The NULL value doesn't affect the join with the I94 immigration data, which covers year 2016 only. Keep it as it is.

### Cleaning the combination of 'City' and 'dt_converted'

In [34]:
# Last few rows whose (City, dt_converted) pair already occurred earlier in the frame
# (the first occurrence of each pair is not flagged).
worldtempe_df[worldtempe_df.duplicated(subset=['City', 'dt_converted'])].tail()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,dt_converted
7148658,2013-05-01,14.309,0.331,Springfield,United States,42.59N,72.00W,2013-05-01
7148659,2013-06-01,19.313,0.353,Springfield,United States,42.59N,72.00W,2013-06-01
7148660,2013-07-01,23.629,0.447,Springfield,United States,42.59N,72.00W,2013-07-01
7148661,2013-08-01,19.579,0.336,Springfield,United States,42.59N,72.00W,2013-08-01
7148662,2013-09-01,15.883,1.368,Springfield,United States,42.59N,72.00W,2013-09-01


In [35]:
# Every row recorded for a city named Springfield in July 2013 (matched on the raw
# 'dt' string column).
worldtempe_df.loc[
    worldtempe_df['City'].eq('Springfield') & worldtempe_df['dt'].eq('2013-07-01')
]

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,dt_converted
7142182,2013-07-01,25.132,0.211,Springfield,United States,37.78N,93.56W,2013-07-01
7145421,2013-07-01,23.824,0.142,Springfield,United States,39.38N,89.48W,2013-07-01
7148660,2013-07-01,23.629,0.447,Springfield,United States,42.59N,72.00W,2013-07-01


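It seems that the temperature for a given city name is recorded at several locations, without much difference between the readings. We can take the average temperature for each city. (Note that the three Springfield rows above have different coordinates, so they may be distinct cities sharing a name.)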

### Baseline dataframe World Temperature

In [36]:
# Write the cleaned frame to CSV without the index column.
# NOTE(review): no per-city averaging is performed in the cells visible here, so rows
# sharing the same (City, dt_converted) are presumably still present in this file — confirm
# whether the aggregation happens downstream.
worldtempe_df.to_csv('worldtempe_df_clean.csv', index=False)

=====================================================================================================

=====================================================================================================