### Fetching the Data from website

In [1]:
import urllib.request as req
import re
import pandas as pd
import numpy as np

In [2]:
site = req.urlopen('http://berkeleyearth.lbl.gov/auto/Global/Land_and_Ocean_complete.txt')

In [3]:
contents=site.read()

In [4]:
contents = str(contents)

In [5]:
contents

'b"% This file contains a detailed summary of the changes in Earth\'s global average\\n% surface temperature estimated by combining the Berkeley Earth land-surface\\n% temperature field with a reinterpolated version of the HadSST ocean temperature \\n% field.  \\n% The current citation for this dataset is: \\n% \\n%    Rohde, R. A. and Hausfather, Z.: The Berkeley Earth Land/Ocean Temperature\\n%    Record, Earth Syst. Sci. Data, 12, 3469\\xef\\xbf\\xbd3479, \\n%    https://doi.org/10.5194/essd-12-3469-2020, 2020.\\n% \\n% The dataset differs slightly from the dataset as described in the citation as \\n% HadSST3 has been replaced with the newer HadSST4, and associated interpolation \\n% parameters have been refit accordingly.  No other changes in methods were needed \\n% when moving to the new version of HadSST. \\n% \\n% Two versions of this average are reported.  These differ in how they treat \\n% locations with sea ice.  In the first version, temperature anomalies in the \\n% prese

In [6]:
pattern_1850 = re.compile(r'1850')
pattern_2 = re.compile(r'% Global Average Temperature Anomaly with Sea Ice Temperature Inferred from Water Temperatures')
                          
matches_1850 = pattern_1850.finditer(contents)
matches_2 = pattern_2.finditer(contents)

for index,match in enumerate(zip(matches_1850,matches_2)):
    if index ==0:
        pos1 = match[0].span()
        pos2 = match[1].span()
    else:
        break

In [7]:
def raw_to_dataframe(paragraph,start_pos, end_pos):
    
    """ This is to Document for Future References.
        
        Arguments
         
         paragraph = string to fetch the dataset from.
         start_pos = starting position of the element
         end_pos = ending positiong of the dataset or the strin, if there are 2 tables in the same string ending position 
                   of only one table should be passed
         
        The Basic Structure Goes like this
         
         1.Fetching_contents -> passing contents and start and end positiong
           This function returns a list in which all the words or contents of the table is present.
         
         2.Cleaning the data -> passing the list from the last function 
           This function removes '\\n'  specifically and if you want to remove anything please specify here.
         
         3.Resizing the data -> The data would be resized to 2074 rows and 12 columns here.
           It is hard coded future resizing can be specified here.
         
         4.Create Sample dataFrame -> This step involves in transforming the values into a dataframe, type casting etc.
         
         5.Creating the Dataframe -> This should be the last step and it returns the Cleaned, processed dataframe.
    """
    def type_casting(element):
        if element=='NaN':
            return float(0)
        else:
            return float(element)
    
    def fetching_contents(start_pos,end_pos,paragraph):
        endpoint,temp = 0,0
        row=[]

        for position in range(start_pos,end_pos):

            if paragraph[position] !=' ':
                temp = position - endpoint
                endpoint=endpoint+1

            elif endpoint !=0:
                row.append(paragraph[temp:temp+endpoint])
                endpoint=0
        
        if len(row)%12 < 5:
            while len(row)%12!=0:
                row.pop()
        
        else:
            value = len(row)%12
            value_to_add = 12 - value
            deficit = len(row) +value_to_add
        
            while len(row) < deficit:
                row.append('NaN')

        return row
    
    def clean_the_data(row):
        cleaned_row = []
        for element in row:
            if '\\n' in element:
                cleaned_row.append(element[:-2])
            else:
                cleaned_row.append(element)
                
        return cleaned_row
    
    def reshape_the_data(cleaned_row):
        
        data1 = np.array(cleaned_row)
        data1 =data1.reshape(len(cleaned_row)//12,12)
        data = data1.tolist()
        
        return data
    
    def create_basic_data(data):
        df = pd.DataFrame(data, columns=['a','b','c','d','e','f','g','h','i','j','k','l'])
    
        for index,element in enumerate(df.columns):
            if index>1:
                df[element] = df[element].apply(type_casting)
            else:
                print(df)
                df[element] = df[element].apply(lambda X: int(X))
                
        return df
    
    def create_actual_dataframe(df):
        
        column_names =[["Anomaly","Uncertainty","Anomaly","Uncertainty","Anomaly","Uncertainty","Anomaly","Uncertainty","Anomaly","Uncertainty"],
               ["Monthly","Monthly","Annual","Annual","Five-year","Five-year","Ten-year","Ten-year","Twenty-year","Twenty-year"]]
        
        tuples = list(zip(column_names[1],column_names[0]))
        indexing = pd.MultiIndex.from_tuples(tuples,names=['',''])
        actual_data = pd.DataFrame(columns=indexing)

        actual_data.insert(0,column='Year',value=0)
        actual_data.insert(1,column='Month',value=0)
        
        df_columns = df.columns
        new_df_columns = actual_data.columns

        for index,column in enumerate(df_columns):
            if index <=1:
                actual_data[new_df_columns[index]] = df[column]
            
            else:

                new_df_columns[index]
                actual_data.loc[:,new_df_columns[index]] = df[column]
        return actual_data
    
    contents = fetching_contents(start_pos,end_pos,paragraph)
    cleaned_contents = clean_the_data(contents)
    resized_data = reshape_the_data(cleaned_contents)
    df = create_basic_data(resized_data)
    actual_data = create_actual_dataframe(df)
    
    return actual_data    

In [47]:
other_site = req.urlopen('http://berkeleyearth.lbl.gov/auto/Global/Complete_TAVG_complete.txt')

In [48]:
contents1=other_site.read()

In [49]:
contents1 = str(contents1)

In [50]:
pattern_1750 = re.compile(r'1750')
matches_1 = pattern_1750.finditer(contents1)

for index,match in enumerate(matches_1):
    if index==0:
        pos=match.span()

In [51]:
land_table = raw_to_dataframe(contents1, pos[0], len(contents1))

         a   b       c      d       e      f    g    h    i    j    k    l
0     1750   1  -0.803  3.878     NaN    NaN  NaN  NaN  NaN  NaN  NaN  NaN
1     1750   2  -1.249  3.825  -1.014  0.966  NaN  NaN  NaN  NaN  NaN  NaN
2     1750   3   0.151  2.661  -1.032  0.999  NaN  NaN  NaN  NaN  NaN  NaN
3     1750   4  -0.203  1.661  -1.057  0.999  NaN  NaN  NaN  NaN  NaN  NaN
4     1750   5  -1.651  3.471  -1.367  1.019  NaN  NaN  NaN  NaN  NaN  NaN
...    ...  ..     ...    ...     ...    ...  ...  ...  ...  ...  ...  ...
3270  2022   7   1.278  0.055     NaN    NaN  NaN  NaN  NaN  NaN  NaN  NaN
3271  2022   8   1.231  0.112     NaN    NaN  NaN  NaN  NaN  NaN  NaN  NaN
3272  2022   9   1.139  0.120     NaN    NaN  NaN  NaN  NaN  NaN  NaN  NaN
3273  2022  10   1.544  0.118     NaN    NaN  NaN  NaN  NaN  NaN  NaN  NaN
3274  2022  11   0.876  0.160     NaN    NaN  NaN  NaN  NaN  NaN  NaN  NaN

[3275 rows x 12 columns]
         a   b       c      d       e      f    g    h    i    j    k    l

In [52]:
land_table.head()

Unnamed: 0_level_0,Year,Month,Monthly,Monthly,Annual,Annual,Five-year,Five-year,Ten-year,Ten-year,Twenty-year,Twenty-year
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Anomaly,Uncertainty,Anomaly,Uncertainty,Anomaly,Uncertainty,Anomaly,Uncertainty,Anomaly,Uncertainty
0,1750,1,-0.803,3.878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1750,2,-1.249,3.825,-1.014,0.966,0.0,0.0,0.0,0.0,0.0,0.0
2,1750,3,0.151,2.661,-1.032,0.999,0.0,0.0,0.0,0.0,0.0,0.0
3,1750,4,-0.203,1.661,-1.057,0.999,0.0,0.0,0.0,0.0,0.0,0.0
4,1750,5,-1.651,3.471,-1.367,1.019,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
land_table.to_csv('land_temp.csv')