In [4]:
from sklearn.decomposition import PCA, TruncatedSVD
from scipy import sparse
import pickle
import pandas as pd
import numpy as np

from Constants import *
import matplotlib.pyplot as plt

from sklearn.manifold import TSNE
from sklearn.metrics import pairwise_distances

### Load data

In [5]:
newspaper_speaker_tfidf = sparse.load_npz(FILE_NEWSPAPER_SPEAKER_TFIDF)
newspaper_speaker_tfidf = newspaper_speaker_tfidf.toarray()

newspaper_token_tfidf = sparse.load_npz(FILE_NEWSPAPER_TOKEN_TFIDF)
newspaper_token_tfidf = newspaper_token_tfidf.toarray()

In [6]:
with open(PICKLE_NEWSPAPER_TO_INDEX, 'rb') as handle:
    newspaper_to_index = pickle.load(handle)

with open(PICKLE_INDEX_TO_NEWSPAPER, 'rb') as handle:
    index_to_newspaper = pickle.load(handle)
    
    
with open(PICKLE_SPEAKER_TO_INDEX, 'rb') as handle:
    speaker_to_index = pickle.load(handle)

with open(PICKLE_INDEX_TO_SPEAKER, 'rb') as handle:
    index_to_speaker = pickle.load(handle)
    
with open(PICKLE_INDEX_TO_TOKEN, 'rb') as handle:
    index_to_token = pickle.load(handle)

with open(PICKLE_TOKEN_TO_INDEX, 'rb') as handle:
    token_to_index = pickle.load(handle)
    
newspaper_owner = pd.read_csv(FILE_NEWSPAPER_OWNER)

### Select newspaper that are owned by a group with multiple newspaper

In [7]:
#Minimum number of newspaper needed in a group to consider that group
MIN_NEWSPAPER_COUNT = 4

newspaper_with_owner = newspaper_owner[~newspaper_owner["owner"].isnull()]

print(f'We sucessfully retrieved: {len(newspaper_with_owner) / len(newspaper_owner) * 100}% of the data')

num_np_by_owner = newspaper_with_owner.groupby("owner").count()["newspaper"]

group_owner = set(num_np_by_owner[num_np_by_owner >= MIN_NEWSPAPER_COUNT].index)

newspaper_in_group = newspaper_with_owner[newspaper_with_owner["owner"].isin(group_owner)]

newspaper_in_group

We sucessfully retrieved: 13.61043194784026% of the data


Unnamed: 0,newspaper,QID,website,owner
2,1045theteam,Q7956286,http://www.1045theteam.com,Townsquare Media
4,107jamz,Q6331512,http://www.107jamz.com,Townsquare Media
5,10news,Q3191396,http://www.10news.com/,E. W. Scripps Company
8,13abc,Q2386816,http://www.13abc.com/,Gray Television
16,247sports,Q16973497,https://247sports.com/,ViacomCBS Streaming
...,...,...,...,...
7269,www,Q150248,https://www.metacritic.com/,ViacomCBS Streaming
7285,wxyz,Q3564870,http://www.wxyz.com/,E. W. Scripps Company
7293,wyrk,Q7958139,http://www.wyrk.com,Townsquare Media
7310,yardbarker,Q8049272,http://www.yardbarker.com/,Fox Entertainment Group


In [8]:
newspaper_in_group['owner'] = pd.Categorical(newspaper_in_group.owner)
newspaper_in_group['owner_ID'] = newspaper_in_group['owner'].cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newspaper_in_group['owner'] = pd.Categorical(newspaper_in_group.owner)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newspaper_in_group['owner_ID'] = newspaper_in_group['owner'].cat.codes


In [9]:
newspaper_in_group

Unnamed: 0,newspaper,QID,website,owner,owner_ID
2,1045theteam,Q7956286,http://www.1045theteam.com,Townsquare Media,38
4,107jamz,Q6331512,http://www.107jamz.com,Townsquare Media,38
5,10news,Q3191396,http://www.10news.com/,E. W. Scripps Company,10
8,13abc,Q2386816,http://www.13abc.com/,Gray Television,18
16,247sports,Q16973497,https://247sports.com/,ViacomCBS Streaming,41
...,...,...,...,...,...
7269,www,Q150248,https://www.metacritic.com/,ViacomCBS Streaming,41
7285,wxyz,Q3564870,http://www.wxyz.com/,E. W. Scripps Company,10
7293,wyrk,Q7958139,http://www.wyrk.com,Townsquare Media,38
7310,yardbarker,Q8049272,http://www.yardbarker.com/,Fox Entertainment Group,12


In [10]:
print(f'We obtain {newspaper_in_group["owner"].nunique()} different groups')

We obtain 46 different groups


### Project speaker TFIDF in 2D using PCA /TSNE an create a map

In [11]:
def project_PCA(selected_newspapers, newspaper_to_index, X, dim=2):
    newspaper_in_group_idx = [newspaper_to_index[newspaper] for newspaper in selected_newspapers]
    selected_X = X[newspaper_in_group_idx]
    
    pca = PCA(n_components=dim)
    Y = pca.fit_transform(selected_X)
    return Y

In [12]:
def project2D_TSNE(selected_newspapers, newspaper_to_index, X, dim=2):
    newspaper_in_group_idx = [newspaper_to_index[newspaper] for newspaper in selected_newspapers]
    selected_X = X[newspaper_in_group_idx]
    
    distance_matrix = pairwise_distances(selected_X, metric='cosine')
    embeddings = TSNE(n_components=dim,metric='precomputed',square_distances=True)
    Y_tsne = embeddings.fit_transform(distance_matrix)
    
    return Y_tsne

### Bokeh plot of the above map

In [13]:
import bokeh
import bokeh.plotting as bpl
from bokeh.plotting import figure, output_file, show, save
from bokeh.palettes import Turbo256, Plasma, cividis, magma
from bokeh.models import LabelSet,Scatter,HoverTool, WheelZoomTool, PanTool, BoxZoomTool, ResetTool, TapTool, SaveTool, OpenURL
from bokeh.models.callbacks import CustomJS
from bokeh.transform import factor_cmap
bokeh.__version__

'2.4.2'

In [14]:
def get_mean_distance_to_center(Y, newspaper_in_group):
    df = pd.DataFrame(
        {
            "x": Y[:, 0],
            "y": Y[:, 1],
            "group": newspaper_in_group.owner
        }
    )
    
    mean_per_group = df.groupby("group").mean()
    df = df.join(mean_per_group,on ="group", rsuffix="_mean")
    df["diff"] = np.abs(df["x"] - df["x_mean"]) + np.abs(df["y"] - df["y_mean"])
    return df.groupby("group")["diff"].mean().sort_values()

In [21]:
def plot_scatter_bokeh(Y, newspaper_in_group, visible_by_default = False):
    df = pd.DataFrame(
        {
            "x": Y[:, 0],
            "y": Y[:, 1],
            "group": newspaper_in_group.owner,
            "newspaper": newspaper_in_group.newspaper,
            "QID": newspaper_in_group.QID,
            "website": newspaper_in_group.website
        }
    )
    
    top_culster_group = set(get_mean_distance_to_center(Y, newspaper_in_group).head(10).index)

    # Show info on hover
    hover = HoverTool(tooltips=[('Newspaper', '@newspaper'),('Group name', '@group'),('QID', '@QID'),('Website', '@website')])

    tools = [hover, WheelZoomTool(), PanTool(), BoxZoomTool(), ResetTool(), SaveTool(), TapTool()]

    groups = df['group'].unique()

    color_map = factor_cmap("group", factors=sorted(groups),palette=magma(len(groups)))

    # create figure and plot
    p = bpl.figure(height=1000, width=1500, tools=tools)

    for group_name,points in df.groupby('group'):

        source = bpl.ColumnDataSource(points)

        # Display newspaper
        scat = p.scatter(x='x', y='y',
                  fill_color= color_map,
                  line_color = color_map,
                  legend_label=group_name, source=source)

        

        if not visible_by_default:
            scat.visible = group_name in top_culster_group
        else:
            scat.visible = True
        

        # Open website on click
        source.selected.js_on_change('indices', CustomJS(args=dict(s1=source), code="""
            const inds = cb_obj.indices;
            for (var i = 0; i < inds.length; i++) {
                var ind = inds[i]
                var url = s1.data['website'][ind]
                window.open(url)
        }
            """)
        )
            
    # Show newspaper of the group when clic on the legend
    p.legend.click_policy="hide"

    # Define legend size
    p.legend.label_text_font_size = '16px'
    #Padding on the overall rectangle
    p.legend.padding = 0
    p.legend.spacing = 0
    #Size of circular points
    p.legend.glyph_height = 16
    p.legend.glyph_width = 16
    #Padding between label entries
    p.legend.label_height = 0
    p.legend.label_width = 0
    
    # Generate the graph
    bpl.show(p)

### PCA

In [22]:
Y =  project_PCA(newspaper_in_group['newspaper'], newspaper_to_index, newspaper_speaker_tfidf)
p = plot_scatter_bokeh(Y,newspaper_in_group)
output_file('newspaper_speaker_PCA.html',mode='inline')
try: 
    save(p)
except:
    pass

In [17]:
Y =  project_PCA(newspaper_in_group['newspaper'], newspaper_to_index, newspaper_token_tfidf)
p = plot_scatter_bokeh(Y,newspaper_in_group)
output_file('newspaper_token_PCA.html',mode='inline')
try: 
    save(p)
except:
    pass

### SVD

In [18]:
import scipy
from sklearn.utils.extmath import randomized_svd
from scipy.sparse.linalg import svds

In [33]:
def project_SVD(selected_newspapers, newspaper_to_index, X, dim=2):
    
    newspaper_in_group_idx = [newspaper_to_index[newspaper] for newspaper in selected_newspapers]
    selected_X = X[newspaper_in_group_idx]
    
    #Same as TruncatedSVD
    U, sigmas, VT = randomized_svd(selected_X, n_components=dim)
    
    X_red = U@np.diag(sigmas)
    
    return X_red, VT

def get_top_axis(VT,index_map,top_num=5):
    
    for i in range(VT.shape[0]):
        max_indices = np.argsort(-VT[i])[:top_num]
        print(f'\nThe maximum values for axis {i} are:\n')
        for index in max_indices:
            print(f'{index_map[index]}:{VT[i,index]:.4f}')
        
        
        min_indices = np.argsort(VT[i])[:top_num]
        print(f'\nThe minimum for axis {i} are:\n')
        for index in min_indices:
            print(f'{index_map[index]}:{VT[i,index]:.4f}')

#### Newspaper speaker

In [34]:
X_red,VT = project_SVD(newspaper_in_group['newspaper'], newspaper_to_index, newspaper_speaker_tfidf,dim=5)
get_top_axis(VT,index_to_speaker,top_num=10)
#Choose wanted axis to plot 
p = plot_scatter_bokeh(X_red[:,1:3],newspaper_in_group)


The maximum values for axis 0 are:

president donald trump:0.3809
boris johnson:0.3502
matt hancock:0.2453
jose mourinho:0.1696
nicola sturgeon:0.1390
pep guardiola:0.1381
rishi sunak:0.1328
tedros adhanom ghebreyesus:0.1295
leo varadkar:0.1273
frank lampard:0.1232

The minimum for axis 0 are:

 palmer:-0.0000
 rowdy  roddy piper:-0.0000
 superstar  billy graham:-0.0000
 yoshida:-0.0000
justin gamble:0.0000
marc horowitz:0.0000
marc huber:0.0000
marc humbert:0.0000
marc j cohen:0.0000
marc janssen:0.0000

The maximum values for axis 1 are:

president donald trump:0.5393
joe biden:0.2080
bernie sanders:0.1765
andrew cuomo:0.1672
anthony fauci:0.1624
elizabeth warren:0.1254
nancy pelosi:0.1162
gavin newsom:0.1128
pete buttigieg:0.1019
mitch mcconnell:0.0903

The minimum for axis 1 are:

boris johnson:-0.1944
matt hancock:-0.1795
jose mourinho:-0.1259
pep guardiola:-0.1019
nicola sturgeon:-0.1006
rishi sunak:-0.0975
leo varadkar:-0.0915
frank lampard:-0.0913
mary lou mcdonald:-0.0888
jur

##### Speaker clustering analysis

Axis 0:
- Max: Frequent speakers
- Min: Non frequent speakers (rare)

Axis 1:
- Max: US speakers
- Min: UK speakers

Axis 2:
- Max: Australian speakers
- Min: US speakers

Axis 3:
- Max: Canadian speakers
- Min: US politicians speakers

Axis 4:
- Max: Basketball speakers
- Min: US politicians speakers

#### Newspaper token

In [35]:
X_red,VT = project_SVD(newspaper_in_group['newspaper'], newspaper_to_index, newspaper_token_tfidf,dim=5)
get_top_axis(VT,index_to_token,top_num=10)
#Choose wanted axis to plot 
p = plot_scatter_bokeh(X_red[:,1:3],newspaper_in_group)


The maximum values for axis 0 are:

people:0.2289
going:0.1894
get:0.1778
time:0.1730
like:0.1615
think:0.1614
one:0.1443
would:0.1341
know:0.1321
really:0.1315

The minimum for axis 0 are:

AAAAA:-0.0000
AAAAAA:-0.0000
AAAAAAA:-0.0000
AAAAHHHH:-0.0000
AAAP:-0.0000
AAB:-0.0000
NurtureCloud:0.0000
golem:0.0000
goldspot:0.0000
Nurul:0.0000

The maximum values for axis 1 are:

game:0.3144
player:0.2493
play:0.1854
team:0.1756
good:0.1404
got:0.1291
club:0.1249
guy:0.1165
season:0.1103
really:0.1066

The minimum for axis 1 are:

people:-0.1950
state:-0.1505
health:-0.1160
community:-0.1089
Trump:-0.1088
president:-0.1086
country:-0.0935
public:-0.0904
need:-0.0860
COVID:-0.0854

The maximum values for axis 2 are:

UK:0.2052
NHS:0.1650
club:0.1601
Government:0.1533
player:0.1183
need:0.1105
support:0.1090
Scotland:0.1005
advice:0.0796
Ireland:0.0763

The minimum for axis 2 are:

like:-0.2302
guy:-0.1597
going:-0.1363
song:-0.1225
music:-0.1170
know:-0.1135
lot:-0.1000
really:-0.0985
kind:-

##### Tokens clustering analysis

Axis 0:
- Max: Frequent words
- Min: Non frequent words

Axis 1:
- Max: Sport
- Min: US health

Axis 2:
- Max: UK sport/politic
- Min: Music/Joy: garbage

Axis 3:
- Max: Music
- Min: Sport

Axis 4:
- Max: Canada
- Min: US/UK : garbage