In [1]:
!pip install --upgrade pyarrow
!pip install -U plotly
!pip install -U kaleido

Collecting pyarrow
  Downloading pyarrow-6.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (25.6 MB)
[K     |████████████████████████████████| 25.6 MB 1.4 MB/s 
Installing collected packages: pyarrow
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 3.0.0
    Uninstalling pyarrow-3.0.0:
      Successfully uninstalled pyarrow-3.0.0
Successfully installed pyarrow-6.0.1
Collecting plotly
  Downloading plotly-5.4.0-py2.py3-none-any.whl (25.3 MB)
[K     |████████████████████████████████| 25.3 MB 22.1 MB/s 
Collecting tenacity>=6.2.0
  Downloading tenacity-8.0.1-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
  Attempting uninstall: plotly
    Found existing installation: plotly 4.4.1
    Uninstalling plotly-4.4.1:
      Successfully uninstalled plotly-4.4.1
Successfully installed plotly-5.4.0 tenacity-8.0.1
Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[K     |█████████████████████

In [2]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import numpy as np
import matplotlib.pyplot as plt
import sys
from IPython.display import clear_output
import plotly.express as px
import random

# Define useful tools for our data analysis



In [3]:
# We fix the colors of some labels so that different graphs are easy to compare.
def process_label(mylabels):
    """
    Return one color per label, keeping the colors of important labels fixed.

    Input :
        mylabels : iterable of labels (e.g. List[string] or a pandas Index).
    Output:
        color_list: List[string].
            One color string per label, in the same order as mylabels.
            Important labels always receive the color assigned to them below;
            any other label receives an "rgb(r, g, b)" color derived from the
            label text, so the same label gets the same color in every call.
    """
    # Fixed palette for the important labels. Every label has its own color so
    # that no two key topics are drawn identically in the same chart.
    key_colors = {
        'obituaries': 'rgb(255, 127,0)',
        'politics': 'violet',
        'sports': 'lightslategrey',
        'us': 'pink',
        'nyregion': 'skyblue',
        'opinion': 'gold',
        'review': 'navy',
        'design': 'darkgoldenrod',
        'books': 'rosybrown',
        'europe': 'slateblue',
        'magazine': 'mediumturquoise',
        'asia': 'forestgreen',
        'television': 'tomato',
        'sunday': 'lime',
        'business': 'firebrick',
    }

    color_list = []
    for label in mylabels:
        color = key_colors.get(label)
        if color is None:
            # Label is not in the fixed palette: seed a private generator with the
            # label text so the color is stable across charts and runs, and the
            # global `random` state is left untouched.
            rng = random.Random(str(label))
            color = "rgb({}, {}, {})".format(rng.randint(0, 255), rng.randint(0, 255), rng.randint(0, 255))
        color_list.append(color)
    return color_list

    
# Sometimes we need to plot a pie chart for distribution comparison.
def plot_topics_pie_chart(nytimes_subject, feature_name, n_features = 10, title = None, min_topic_count = 400):
    """
    Plot a pie chart of the nytimes_subject[feature_name] distribution.

    Rows whose 'topic' value occurs fewer than min_topic_count times are dropped
    first. The chart is written to '<feature_name>_proportion.html' in the
    current working directory and then displayed.

    Input :
        nytimes_subject : pandas.DataFrame
            Must contain a 'topic' column and a column named feature_name.
        feature_name : string
            Name of the column whose value counts are plotted.
        n_features : int
            Number of classes in the pie chart. By default the 10 most popular classes.
        title : string
            Title for the plot; defaults to feature_name. It is capitalized and
            suffixed with ' distribution'.
        min_topic_count : int, default 400
            Topics with fewer occurrences than this are removed before counting.
    """
    if title is None:
        title = feature_name.capitalize()

    # Not every link is suitable for our template, so topics seen fewer than
    # min_topic_count times are assumed to be parsed incorrectly and are removed.
    topic_counts = nytimes_subject['topic'].value_counts()
    rare_topics = topic_counts[topic_counts < min_topic_count].index
    nytimes_subject = nytimes_subject.loc[~nytimes_subject['topic'].isin(rare_topics)]

    font_size = 14
    plt.rcParams.update({'font.size': font_size})

    # Extract the <n_features> most popular classes (value_counts is sorted descending).
    top_counts = nytimes_subject[feature_name].value_counts().iloc[:n_features]
    df = pd.DataFrame(data={'Feature': list(top_counts.index), 'Amount': np.array(top_counts.values)})

    # `labels` in plotly express is a dict of column-name overrides, not a list of
    # slice names; the slice names already come from the 'Feature' column.
    fig = px.pie(df, values='Amount', names='Feature')
    fig.update_traces(textposition='inside', textinfo='percent+label', showlegend=False)

    fig.update_layout(
        width=430,
        height=430,
        font=dict(size=font_size),
        title={
            'text': f'{title.capitalize()} distribution',
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top'},
        showlegend=False)

    fig.write_html(f'{feature_name}_proportion.html')
    fig.show()


def _show_topic_bar_chart(topic_counts, title, width, output_path, font_size):
    """Draw one horizontal bar chart from a topic value_counts Series, save it as HTML and show it."""
    colors = process_label(topic_counts.index)
    df = pd.DataFrame(data={'Topics': list(topic_counts.index), 'Amount': np.array(topic_counts.values)})
    # Bars are colored per topic; the sequence follows the row order of df, so
    # colors[i] belongs to the i-th topic.
    fig = px.bar(df, x='Amount', y='Topics', color="Topics", orientation="h",
                 color_discrete_sequence=colors)
    fig.update_layout(
        width=width,
        height=400,
        font=dict(size=font_size),
        title={
            'text': title,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top'},
        showlegend=False)
    fig.write_html(output_path)
    fig.show()


def plot_topics_bar_chart(key_list, nytimes_subject, feature_name, other_width = 860, plot_other = True, min_topic_count = 400):
    """
    Plot a bar chart of the 10 most frequent topics for every class in key_list.

    Rows whose 'topic' value occurs fewer than min_topic_count times are dropped
    first. Each chart is written to '<feature_name>_<key>.html' (and the optional
    "Others" chart to '<feature_name>_other.html') in the current working
    directory and then displayed.

    Input :
        key_list : List[str]
            Classes (values of nytimes_subject[feature_name]) to plot, one chart each.
        nytimes_subject : pandas.DataFrame
            Must contain a 'topic' column and a column named feature_name.
        feature_name : string
            Name of the column holding the classes.
        other_width : int, default 860
            Width of the chart for all classes not included in key_list.
        plot_other : bool, default True
            Whether to also plot one chart for all rows whose class is not in key_list.
        min_topic_count : int, default 400
            Topics with fewer occurrences than this are removed before counting.
    """
    # Not every link is suitable for our template, so topics seen fewer than
    # min_topic_count times are assumed to be parsed incorrectly and are removed.
    all_topic_counts = nytimes_subject['topic'].value_counts()
    rare_topics = all_topic_counts[all_topic_counts < min_topic_count].index
    nytimes_subject = nytimes_subject.loc[~nytimes_subject['topic'].isin(rare_topics)]

    font_size = 14
    plt.rcParams.update({'font.size': font_size})

    for key in key_list:
        in_class = nytimes_subject[feature_name] == key
        class_topics = nytimes_subject.loc[in_class, 'topic'].value_counts().iloc[:10]
        _show_topic_bar_chart(class_topics, key.capitalize(), 430,
                              f'{feature_name}_{key}.html', font_size)

    if not plot_other:
        return

    # "Others" means every row whose class is not one of the requested keys.
    # isin() covers any number of keys, including none at all.
    in_any_key = nytimes_subject[feature_name].isin(list(key_list))
    other_topics = nytimes_subject.loc[~in_any_key, 'topic'].value_counts().iloc[:10]
    _show_topic_bar_chart(other_topics, 'Others', other_width,
                          f'{feature_name}_other.html', font_size)

## Read our data

In [4]:
# Column whose classes are analysed in the following cells.
feature_name = 'gender'
# The plotting helpers read the 'topic' column and the `feature_name` column of this table.
nytimes_subject = pd.read_csv(filepath_or_buffer='../data/ada_gender_df.csv')

#### Plot Gender distribution.

In [5]:
# Only the two most frequent classes are shown in the pie chart.
plot_topics_pie_chart(
    nytimes_subject=nytimes_subject,
    feature_name=feature_name,
    n_features=2,
)

#### Plot topics distribution for males and females.

In [6]:
# One topic bar chart per listed class; the "Others" chart is skipped.
plot_topics_bar_chart(
    key_list=['male', 'female'],
    nytimes_subject=nytimes_subject,
    feature_name=feature_name,
    plot_other=False,
)

### Let's do the same for ethnic group data.

In [7]:
# Column whose classes are analysed in the following cells.
feature_name = 'ethnic_group'
# Rebinds nytimes_subject to the ethnic-group table; the plotting helpers read its
# 'topic' column and the `feature_name` column.
nytimes_subject = pd.read_csv(filepath_or_buffer='../data/ada_ethnic_group_df.csv')

In [8]:
# Pie chart of the ten most frequent classes.
plot_topics_pie_chart(
    nytimes_subject=nytimes_subject,
    feature_name=feature_name,
    n_features=10,
)

In [9]:
# One topic bar chart per listed class, plus an "Others" chart drawn at the same
# width (430) as the per-class charts.
plot_topics_bar_chart(
    key_list=['Italian Argentines', 'Gujarati people'],
    nytimes_subject=nytimes_subject,
    feature_name=feature_name,
    other_width=430,
    plot_other=True,
)