## Helper Functions and other functions not shown in the main notebook

**Convert .txt files to dataframes (which will be converted to csv afterwards)**

In [1]:
def text_to_df(file_path):
    """
    Parse a "key: value" text file into a DataFrame with one row per record.

    Every non-blank line is split on its first colon: the stripped left part
    becomes the column name and the stripped right part the value (kept as
    text). Lines without a colon are ignored. A blank line closes the current
    record; a key repeated inside one record keeps its last value.

    Runs of blank lines (or blank lines at the start/end of the file) do not
    produce empty rows: a record is only stored once it holds at least one key.

    file_path: path of the UTF-8 encoded text file to read
    returns: pd.DataFrame built from the list of record dictionaries
    """

    # One dictionary per completed record
    records = []

    # Key/value pairs of the record currently being read
    current_record = {}

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # A blank line ends a record; skip it when nothing was
                # collected so repeated separators do not add empty rows
                if current_record:
                    records.append(current_record)
                    current_record = {}
                continue

            # Split on the first colon only, so values may contain colons
            key, separator, value = line.partition(':')
            if separator:
                current_record[key.strip()] = value.strip()

    # Keep the last record when the file does not end with a blank line
    if current_record:
        records.append(current_record)

    return pd.DataFrame(records)

**Create maps using geopy**

In [2]:
def get_coordinates(country):
    """
    Look up the latitude and longitude of a country name with Nominatim.

    country: query string handed to the geocoder
    returns: (latitude, longitude) of the match, or (None, None) when the
             lookup fails or the geocoder returns no match
    """
    # Initialize a geolocator using Nominatim with a specific user_agent
    geolocator = Nominatim(user_agent="geoapiExercices")
    try:
        # Obtain the location for the given country, with English naming
        location = geolocator.geocode(country, language='en')
    except Exception:
        # Best-effort lookup: a failing request must not abort the caller.
        # Unlike a bare "except:", KeyboardInterrupt and SystemExit still
        # propagate, so a long geocoding loop can be interrupted.
        return (None, None)

    # No usable result for this query
    if location is None:
        return (None, None)

    return (location.latitude, location.longitude)

In [3]:
def plot_map_ratings (user_ratings):
    """
    Build a Folium map with one circle per country found in user_ratings.

    user_ratings: dataframe with a 'country' column (one row per rating)
    returns: the folium.Map; each circle uses the number of ratings of the
             country as its radius and shows it in the popup
    """
    # Number of rows per country, as a two-column dataframe
    ratings_by_country = user_ratings['country'].value_counts().reset_index()
    ratings_by_country.columns = ['country', 'nbr_ratings']

    # Geocode every country once; failed lookups give (None, None)
    ratings_by_country['Coordinates'] = ratings_by_country['country'].apply(get_coordinates)

    # World map centred on latitude 0 / longitude 0
    world_map = folium.Map(location=[0,0],zoom_start=2)

    for _, entry in ratings_by_country.iterrows():
        coordinates = entry['Coordinates']
        # Countries without coordinates are left off the map
        if coordinates[0] is None:
            continue

        popup_text = '{}: {} ratings'.format(entry['country'], entry['nbr_ratings'])
        circle = folium.Circle(
            location=coordinates,
            radius=entry['nbr_ratings'],
            color='crimson',
            fill=True,
            fill_color='crimson',
            popup=popup_text
        )
        circle.add_to(world_map)

    return world_map

**Plot the general trends, the seasonal trends, and the noise**

In [4]:
def plot_STL(ratings_per_month, type):
    """
    Decompose a series with STL (seasonal=13, period=12) and plot the trend,
    seasonal and residual components on stacked subplots of one figure.

    ratings_per_month: series handed to STL
    type: value used as the line color of every component
    """

    # Fit the Seasonal-Trend decomposition using LOESS
    decomposition = STL(ratings_per_month, seasonal=13, period=12).fit()

    # Figure laid out as 4 rows x 1 column
    plt.figure(figsize=(10, 6))

    # Rows 1 and 2: trend and seasonality, drawn the same way
    top_panels = (
        (411, decomposition.trend, 'Trend'),
        (412, decomposition.seasonal, 'Seasonality'),
    )
    for position, component, name in top_panels:
        plt.subplot(position)
        plt.plot(component, label=name, color=type)
        plt.legend(loc='best')
        plt.grid()

    # Row 3: residuals, sharing the y axis of the seasonality panel
    # (the current axes at this point)
    seasonality_axes = plt.gca()
    plt.subplot(413, sharey=seasonality_axes)
    plt.plot(decomposition.resid, label='Residuals', color=type)
    plt.legend(loc='best')
    plt.tight_layout()
    plt.grid()

    # Row 4: empty placeholder with its axes hidden
    plt.subplot(414)
    plt.axis('off')

**Compute the proportion of number of ratings for a beer subset**

In [6]:
def proportion_nbr_ratings(df, beer_subset, date_start, date_end):

    """
    Given a subset of beers, a start date and end date, returns the proportion of number of ratings per month
    (i.e. number of ratings of the beer subset normalized according to the number of ratings for all beers)
    of the subset in the given period.

    A month in which the subset has no rating but df has some gets a proportion of 0 (not NaN).
    A month that only appears in beer_subset (no rating in df) stays NaN.

    df: global dataframe, considering all the beers (needs 'year', 'year_month' and 'rating' columns)
    beer_subset: subset of beers (generally a subset of df), with the same columns
    date_start: first value of the 'year' column to consider (inclusive)
    date_end: last value of the 'year' column to consider (inclusive)

    returns: pandas Series indexed by 'year_month' holding the proportion for each month
    """

    def _within_period(frame):
        # Keep the rows whose 'year' lies in [date_start, date_end]
        return frame[
            (frame['year'] >= date_start) &
            (frame['year'] <= date_end)
        ]

    # Number of (non-missing) ratings per month for all beers
    all_beer_ratings = _within_period(df).groupby('year_month')["rating"].count()

    # Number of (non-missing) ratings per month for the beer subset
    subset_ratings = _within_period(beer_subset).groupby('year_month')["rating"].count()

    # A month without any rating for the subset is absent from subset_ratings;
    # the division would align the indexes and yield NaN for it. Fill such
    # months with 0 so their proportion is 0.
    all_months = subset_ratings.index.union(all_beer_ratings.index)
    subset_ratings = subset_ratings.reindex(all_months, fill_value=0)

    # Proportion of number of ratings per month
    return subset_ratings / all_beer_ratings

**Compute the standardized ratings (the grade) for a beer subset**

In [11]:
def rates_standardized(df, beer_subset, date_start, date_end):

    """
    Given a subset of beers, a start date and end date, returns the standardized mean rate per month
    (i.e. z-scores), of the subset in the given period.

    The monthly mean ratings of the subset are centered on the mean and scaled by the standard
    deviation of all individual ratings of the subset in the period.

    df: global dataframe; not read by this function, kept so existing calls keep working
    beer_subset: subset of beers (needs 'year', 'year_month' and 'rating' columns)
    date_start: first value of the 'year' column to consider (inclusive)
    date_end: last value of the 'year' column to consider (inclusive)

    returns: pandas Series indexed by 'year_month' holding the z-score of each monthly mean
    """

    # Keep the ratings of the subset from date_start to date_end
    beer_subset = beer_subset[
        (beer_subset['year'] >= date_start) &
        (beer_subset['year'] <= date_end)
    ]

    # Mean and standard deviation of the ratings of the subset over the whole period
    mean_rate = beer_subset['rating'].mean()
    std_rate = beer_subset['rating'].std()

    # Mean rate per month
    beer_subset_rate_per_month = beer_subset.groupby('year_month')["rating"].mean()

    # Standardized (z-score) mean rate per month
    beer_subset_z_score = (beer_subset_rate_per_month - mean_rate) / std_rate

    return beer_subset_z_score

**Plot the seasonal trends, given the proportion of ratings per month, or ratings per month...**

In [8]:
def plot_seasonal_trends(beer_feature, title, ylabel, color, month_increment=3):

    """
    Plot a per-month feature of a beer subset (e.g. rates per month) as a line chart,
    then plot its STL decomposition with plot_STL.

    beer_feature: pandas Series with per month values; its index is converted with to_timestamp()
    title: title of the line chart
    ylabel: label of the y axis, depending on the chosen feature (e.g. rate, or proportion of number of ratings)
    color: color of the line chart and of the STL plots
    month_increment: step between two displayed month labels. this only affects the labels, not the computation.
    """

    # Line chart of the raw feature, one point per month
    plt.figure(figsize = (14,4))
    month_labels = beer_feature.index.astype(str)
    plt.plot(month_labels, beer_feature.values, marker = 'o', color = color)
    plt.xlabel('Month')
    plt.ylabel(ylabel)
    plt.title(title)

    # Only one label every month_increment months is displayed, to have a clearer visualisation
    plt.xticks(rotation = 90, fontsize = 9)
    shown_positions = range(0, len(month_labels), month_increment)
    shown_labels = [month_labels[position] for position in shown_positions]
    plt.xticks(shown_positions, shown_labels, rotation=45)

    plt.grid()
    plt.show()

    # Work on a copy so that the index of the caller's Series is left untouched
    stl_input = beer_feature.copy()
    stl_input.index = stl_input.index.to_timestamp()

    # Plot trend, seasonality and residuals
    plot_STL(stl_input, color)

# To be removed:

In [31]:
def plot_seasonal_pattern_nbr_ratings_rb(dataframe, beer_style, color):
    """
    Plot, for each month from 2010 to 2016, the number of ratings of a beer style
    divided by the number of ratings of all beers, then its STL decomposition.

    :param dataframe: pd.DataFrame with 'year', 'year_month', 'style' and 'rating' columns
    :param beer_style: string, value of the 'style' column to select
    :param color: color used for both plots
    :return: None; displays the monthly line chart and calls plot_STL
    """
    # Ratings of all beers between 2010 and 2016 (both included)
    in_period = (dataframe['year'] >= 2010) & (dataframe['year'] <= 2016)
    period_df = dataframe[in_period]
    total_per_month = period_df.groupby('year_month')["rating"].count()

    # Ratings of the requested style over the same period
    style_df = period_df[period_df['style'] == beer_style]
    style_per_month = style_df.groupby('year_month')["rating"].count()

    # Share of the style in the monthly number of ratings
    style_share = style_per_month / total_per_month

    # Convert the index to timestamp
    style_share.index = style_share.index.to_timestamp()

    plt.figure(figsize = (14,4))
    month_labels = style_share.index.astype(str)
    plt.plot(month_labels, style_share.values, marker = 'o', color = color)
    plt.xlabel('Month')
    plt.ylabel('% of the number of ratings relative to total number of ratings')
    plt.title(beer_style)
    plt.xticks(rotation = 90, fontsize = 9)
    plt.show()

    plot_STL(style_share, color)

Plot the seasonal pattern of the number of ratings depending on the ABV type

In [34]:
def plot_seasonal_pattern_abv_rb(dataframe, abv_type, color):
    """
    Plot, for each month from 2010 to 2016, the number of ratings of strong or light beers
    divided by the number of rows with a known abv, then its STL decomposition.

    :param dataframe: pd.DataFrame with 'year', 'year_month', 'abv' and 'rating' columns
    :param abv_type: string, 'strong' (abv >= 8) or 'light' (abv <= 5)
    :param color: string, color used for both plots
    :return: None; displays the monthly line chart and calls plot_STL
    :raises ValueError: if abv_type is neither 'strong' nor 'light'
    """
    # Fail early with a clear message; otherwise an unknown abv_type would
    # only surface later as an unbound local variable
    if abv_type not in ('strong', 'light'):
        raise ValueError(
            "abv_type must be 'strong' or 'light', got {!r}".format(abv_type)
        )

    # Rows between 2010 and 2016 (both included)
    all_beer_df = dataframe[
        (dataframe['year'] >= 2010) &
        (dataframe['year'] <= 2016)
        ]

    # Denominator: number of rows per month with a non-missing abv
    all_beer_abv_df = all_beer_df.groupby('year_month')["abv"].count()

    # Numerator: ratings per month of the beers in the requested abv range
    if abv_type == 'strong':
        beer_abv_type_df = all_beer_df[(all_beer_df['abv'] >=8.)]
    else:
        beer_abv_type_df = all_beer_df[(all_beer_df['abv'] <=5.)]
    ratings_per_month_df = beer_abv_type_df.groupby('year_month')["rating"].count()
    ratings_perc_df = ratings_per_month_df / all_beer_abv_df

    #Convert the index to timestamp
    ratings_perc_df.index = ratings_perc_df.index.to_timestamp()
    plt.figure(figsize=(14, 4))
    plt.plot(ratings_perc_df.index.astype(str), ratings_perc_df.values, marker='o', color=color)
    plt.xlabel('Month')
    plt.ylabel('% of the number of ratings relative to total number of ratings')
    plt.title(abv_type)
    plt.xticks(rotation=90, fontsize=9)
    plt.show()
    plot_STL(ratings_perc_df, color)