In [None]:
## Install the needed libraries ##
%%capture
!pip install datasets
!pip install matplotlib
!pip install pandas

In [None]:
## Import the needed libraries ##
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

In [None]:
## Load one split of the dataset from Huggingface ##
# The split names listed by the original author are:
#   female_18to49_yrs : Female speakers from 18 to 49 years old (18 <= n <= 49)
#   female_gt_49_yrs  : Female speakers greater than 49 years old (n > 49)
#   male_18to49_yrs   : Male speakers from 18 to 49 years old (18 <= n <= 49)
#   male_gt_49_yrs    : Male speakers greater than 49 years old (n > 49)
samromur_milljon = load_dataset(
    path="language-and-voice-lab/samromur_milljon",
    split="female_18to49_yrs",
)

In [None]:
## Check the column names and the number of rows ##
## (a bare expression on the last line of a cell is shown as the cell's output) ##
samromur_milljon

In [None]:
## Create a Pandas DataFrame that only holds the two variables needed below: 'age' and 'duration' ##
## The columns are read directly instead of looping over the rows: row iteration materialises ##
## every field of every recording (presumably including the audio - confirm against the dataset ##
## schema), and the original loop ran over the whole dataset twice ##
df = pd.DataFrame({
    'age': list(samromur_milljon['age']),
    'duration': list(samromur_milljon['duration'])
})

## Convert 'duration' to floats ##
df['duration'] = df['duration'].astype(float)


In [None]:
## A few recordings had an age value of '21'. They are therefore counted in the '20-29' age group ##
df['age'] = df['age'].replace('21', '20-29')

## Count the recordings per age group and sort by the age-group label ##
age_counts = df['age'].value_counts().sort_index()

## Calculate the percentage of recordings for each age group ##
total_recordings = age_counts.sum()
percentages = (age_counts / total_recordings * 100).round(2)

## Build the speaker-group part of the title from the split that was actually loaded, ##
## so the title cannot disagree with the data (it used to be hard-coded to a single split) ##
split_labels = {
    'female_18to49_yrs': 'Female speakers from 18 to 49 years old',
    'female_gt_49_yrs': 'Female speakers over 49 years old',
    'male_18to49_yrs': 'Male speakers from 18 to 49 years old',
    'male_gt_49_yrs': 'Male speakers over 49 years old',
}
split_name = str(samromur_milljon.split)
## Unknown split names (e.g. a sliced split) fall back to the raw name ##
speaker_group = split_labels.get(split_name, split_name)

## Plot a bar chart with the age groups in ascending order ##
## rot=45 rotates the x-axis labels for better readability ##
fig, ax = plt.subplots(figsize=(10, 6))
age_counts.plot(kind='bar', edgecolor='black', rot=45, ax=ax)
ax.set_title(f'Count of Recordings by Age - {speaker_group}')
ax.set_xlabel('Age')
ax.set_ylabel('Count of Recordings')
ax.grid(axis='y')

## Set the maximum of the y-axis to 180,000 ##
ax.set_ylim(0, 180000)

## The label offsets are fractions of the y-axis maximum; it is constant, so read it once ##
y_max = max(ax.get_ylim())

## Add labels to the bars with count and percentage values, placing percentage directly above the bar ##
## The values are iterated directly: indexing a label-indexed Series with an integer (percentages[i]) ##
## relies on a deprecated positional fallback ##
for i, (count, percentage) in enumerate(zip(age_counts, percentages)):
    ## Place the percentage just above the bar ##
    percentage_height = count + 0.01 * y_max
    ax.text(i, percentage_height, f"({percentage}%)", ha='center', va='bottom')
    ## Place the count above the percentage ##
    count_height = percentage_height + 0.04 * y_max
    ax.text(i, count_height, f"{count}", ha='center', va='bottom')

plt.show()

In [None]:
## Define the bin edges for the duration in seconds. Recordings of 8 seconds or more are grouped together ##
columns = [0, 1, 2, 3, 4, 5, 6, 7, 8]

## Derive the bin labels from the edges ("0-1", "1-2", ..., "7-8", "8+") so they cannot drift apart ##
duration_labels = [f"{low}-{high}" for low, high in zip(columns[:-1], columns[1:])]
duration_labels.append(f"{columns[-1]}+")

## Group the recordings by duration. right=False makes every bin include its lower edge and ##
## exclude its upper edge, e.g. "1-2" holds 1 <= duration < 2; the last bin is open-ended ##
df['duration_bin'] = pd.cut(df['duration'], columns + [float('inf')], right=False, labels=duration_labels)

## Count the number of recordings for each duration bin (sort=False keeps the bins in ascending order) ##
duration_counts = df['duration_bin'].value_counts(sort=False)

## Calculate the percentage of recordings for each duration bin ##
total_recordings = duration_counts.sum()
percentages = (duration_counts / total_recordings * 100).round(2)

## Build the speaker-group part of the title from the split that was actually loaded, ##
## so the title cannot disagree with the data (it used to be hard-coded to a single split) ##
split_labels = {
    'female_18to49_yrs': 'Female speakers from 18 to 49 years old',
    'female_gt_49_yrs': 'Female speakers over 49 years old',
    'male_18to49_yrs': 'Male speakers from 18 to 49 years old',
    'male_gt_49_yrs': 'Male speakers over 49 years old',
}
split_name = str(samromur_milljon.split)
## Unknown split names (e.g. a sliced split) fall back to the raw name ##
speaker_group = split_labels.get(split_name, split_name)

## Plot the results as a bar chart; rot=0 keeps the x-axis labels horizontal ##
fig, ax = plt.subplots(figsize=(10, 6))
duration_counts.plot(kind='bar', edgecolor='black', rot=0, ax=ax)
ax.set_title(f'Count of Recordings by Duration (in seconds) - {speaker_group}')
ax.set_xlabel('Duration (seconds)')
ax.set_ylabel('Count of Recordings')

## Set the maximum of the y-axis to 120,000 ##
ax.set_ylim(0, 120000)

## The label offsets are fractions of the y-axis maximum; it is constant, so read it once ##
y_max = max(ax.get_ylim())

## Add labels to the bars with count and percentage values, placing percentage directly above the bar ##
## The values are iterated directly: indexing a label-indexed Series with an integer (percentages[i]) ##
## relies on a deprecated positional fallback ##
for i, (count, percentage) in enumerate(zip(duration_counts, percentages)):
    ## Place the percentage value just above the bar ##
    percentage_height = count + 0.01 * y_max
    ax.text(i, percentage_height, f"({percentage}%)", ha='center', va='bottom')
    ## Place the total count value above the percentage value ##
    count_height = percentage_height + 0.04 * y_max
    ax.text(i, count_height, f"{count}", ha='center', va='bottom')

plt.show()

In [None]:
## Two previously saved graphs are combined into one figure for easier comparison and visualization. ##
## The graphs compare the structure of the datasets for the same age group but different genders ##

## Placeholder paths to the saved graph images - replace with the real file locations ##
graph_path_female = r'path_to_graph_female'
graph_path_male = r'path_to_graph_male'

## Read both images from disk ##
graph_female = mpimg.imread(graph_path_female)
graph_male = mpimg.imread(graph_path_male)

## Create a figure with a 2x1 grid of subplots (two rows, one column): female on top, male below ##
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(15, 6))

## Show each image in its own panel and hide the axes, since the images carry their own ##
for panel, graph in zip(axs, (graph_female, graph_male)):
    panel.imshow(graph)
    panel.axis('off')

## Adjust the layout so the panels do not overlap ##
plt.tight_layout()
plt.show()
