In [2]:
import pandas as pd
import numpy as np
import json
from pandas import json_normalize
from dash import Dash, html, dcc
import plotly.express as px
from urllib.parse import urlparse
from langdetect import detect
from datetime import date
from datetime import timedelta


In [3]:
def group(name,date,data):
    table = data.groupby([name])['id'].count()
    df = table.to_frame().reset_index()
    df.rename(columns = {'id':'ocurrences'}, inplace = True)
    if(date != 'today'):
        others = df[df.ocurrences < 10]
        df=df.drop(others.index)
        df=df.append({name:'others','ocurrences':others.count().ocurrences},ignore_index=True)
    return df

def groupBy(name,date,data):
    df = group(name,date,data)
    df = df.sort_values('ocurrences', ascending=True)
    df.to_csv('tables/'+name+date+'ByOcurrencesPlot.csv',index=False) 

def languagueGroupBy(name,date,data,languague):
    df = group(name,date,data)
    df = df.sort_values('ocurrences', ascending=True)
    df.to_csv('tables/'+languague+'/'+name+date+'ByOcurrencesPlot.csv',index=False) 

In [4]:
#Domain Analisis 
#Dominios compartidos por canales 
def ChannelPerPost(data):#Número de Canales que se comparten por post
    table = data.groupby(['channel'])['id'].count()
    df= table.to_frame().reset_index()
    return df

def DomainsPerPost(data):#Número de dominios compartidos por post
    table = data.groupby(['url'])['id'].count()
    df= table.to_frame().reset_index()
    return df

def PostPerTwitter(data):#Número de veces que un post se comparte en telegram y acaba en twitter
    table = data.groupby(['id','date'])['TwitterId'].count()
    df = table.to_frame().reset_index()
    return df

def DomainsPerTwitter(data):#Número de veces que un dominio se comparte en telegram y acaba en twitter
    table = data.groupby(['url','date'])['TwitterId'].count()
    df = table.to_frame().reset_index()
    return df


def RetweetPerDomain(data):#Numero de retweet en función del dominio
    table = data.groupby(['url'])['retweet'].sum()
    df = table.to_frame().reset_index()
    df = df.sort_values('retweet', ascending=True)
    df.to_csv('tables/RetweetPerDomain.csv',index=False) 
    return df

def LikesPerDomain(data):#Numero de me gustas en función del dominio
    table = data.groupby(['url'])['favorites'].sum()
    df = table.to_frame().reset_index()
    df = df.sort_values('favorites', ascending=True)
    df.to_csv('tables/LikePerDomian.csv',index=False) 
    return df

def RetweetPerChannel(data):#Numero de retweet en función del dominio
    table = data.groupby(['channel'])['retweet'].sum()
    df = table.to_frame().reset_index()
    df = df.sort_values('retweet', ascending=True)
    df.to_csv('tables/RetweetPerChannel.csv',index=False) 
    return df

def LikesPerChannel(data):#Numero de me gustas en función del dominio
    table = data.groupby(['channel'])['favorites'].sum()
    df = table.to_frame().reset_index()
    df = df.sort_values('favorites', ascending=True)
    df.to_csv('tables/LikePerChannel.csv',index=False) 
    return df




In [5]:
data = pd.read_csv('dataset.csv')
print(data.count())

Unnamed: 0       18639
channel          18639
id               18639
twitter          18639
title            18639
url              18639
date             18639
text             11095
languague        18639
TwitterId         1636
TwitterText       1636
favorites         1636
name              1383
quoted_tweets     1636
retweet           1636
TwitterDate       1636
dtype: int64


In [6]:

def hostNameTranformation(data):
     for index,row in data.iterrows():
        host = row['url']
        if(not (host.startswith('www.'))):
                data['url'].loc[index] = 'www.' + host
                            
     return data
 
data = hostNameTranformation(data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [7]:
#Limpiamos la base de datos, tras observar que hay campos que tienen el title=='NEWS? y el text =='NULL'
data= data.drop(data[(data['twitter'] =='no_data') & (data['title'] =='NEWS') & (data['text'].isnull())].index)
pr = data[~data['text'].str.contains('JavaScript', na = False)]
data= data.drop(data[data['text'].str.contains('JavaScript', na = False)].index)
data = data.replace({'TwitterDate': {'no_data': 0}})
#Le quitamos los segundos a las fechas
data['date_ms'] = data['date']
data['date'] = data['date'].str.slice(stop=10)
data['TwitterDate'] = data['TwitterDate'].str.slice(stop=10)
#Los convertimos en date con el formato  yyyy-mm-dd
data['date'] = pd.to_datetime(data['date'], format='%Y-%m-%d')
data['TwitterDate'] = pd.to_datetime(data['TwitterDate'], format='%Y-%m-%d')
#Completamos con los campos text vacios 
data = data.fillna({'text': data.TwitterText})#completamos con twitter siempre que se pueda
data = data.fillna({'text': data.title})#completamos con title cuando no se pueda con twitter
#Corregimos el lenguaje 
languague = []
for index,row in data.iterrows():
    try:
        languague.append(detect(row['text']))
    except:
        languague.append(0)
        
data['languague'] = languague       
print(data.count())
data.to_csv('data_bertopic3.csv')


Unnamed: 0       16242
channel          16242
id               16242
twitter          16242
title            16242
url              16242
date             16242
text             16242
languague        16242
TwitterId         1455
TwitterText       1455
favorites         1455
name              1202
quoted_tweets     1455
retweet           1455
TwitterDate       1282
date_ms          16242
dtype: int64


In [8]:
table = data.groupby(['url'])['retweet'].count()
df = table.to_frame().reset_index()
df = df.sort_values('retweet', ascending=True)
df.to_csv('media.csv',index=False) 


In [9]:
#Analisis de direccionabilidad 
#cogemos solo los datos con tweets 
df = data.loc[(data['TwitterId'] > 0)]
df = df.assign(antes = np.where((df.date >= df.TwitterDate),'Twitter','Telegram'))
print(df.loc[:, ['date', 'TwitterDate','antes']])
print(df.antes.value_counts())


            date TwitterDate     antes
1     2022-11-23  2022-11-21   Twitter
2     2022-11-23  2022-11-21   Twitter
3     2022-11-23  2022-11-21   Twitter
4     2022-11-23  2022-11-22   Twitter
5     2022-11-23  2022-11-22   Twitter
...          ...         ...       ...
12844 2022-12-14  2022-12-17  Telegram
12848 2022-12-19  2022-12-19   Twitter
12849 2022-12-19  2022-12-19   Twitter
12850 2022-12-19  2022-12-19   Twitter
12851 2022-12-19  2022-12-19   Twitter

[1455 rows x 3 columns]
Twitter     1129
Telegram     326
Name: antes, dtype: int64


In [10]:
#Telegram to Twitter
df2 = df.loc[(df['antes'] == 'Telegram')]
dmxps = DomainsPerPost(df2)
chxps = ChannelPerPost(df2)
dmxps = dmxps.sort_values('id', ascending=True)
chxps = chxps.sort_values('id', ascending=True)
dmxps.to_csv('tables/TelegramToTwitter.csv',index=False) 
chxps.to_csv('tables/ChannelsTelegramToTwitter.csv',index=False) 
#Twitter to Telegram
df2 = df.loc[(df['antes'] == 'Twitter')]
dmxps = DomainsPerPost(df2)
chxps = ChannelPerPost(df2)
dmxps = dmxps.sort_values('id', ascending=True)
chxps = chxps.sort_values('id', ascending=True)
dmxps.to_csv('tables/TwitterToTelegram.csv',index=False) 
chxps.to_csv('tables/ChannelsTwitterToTelegram.csv',index=False) 


In [11]:
#POST 
chxps = ChannelPerPost(data) 
dmxps = DomainsPerPost(data)
#DOMINIOS 
psxtw = PostPerTwitter(data)
dmxtw = DomainsPerTwitter(data)
rtxdm = RetweetPerDomain(data)
lkxdm = LikesPerDomain(data)
#CHANNELS 
rtxch = RetweetPerChannel(data)
lkxch = LikesPerChannel(data)



In [12]:
#Date of analisis 
import datetime
fecha = datetime.datetime(2023, 2, 8)

today = data.loc[(data['date'] == fecha)]

groupBy('url','today',today)
groupBy('channel','today',today)
groupBy('languague','today',today)

#lastWeek
filterDate = fecha - timedelta(days=7)
lastweek = data.loc[(data['date'] >= filterDate)
                     & (data['date'] < fecha)]
groupBy('url','lastweek',lastweek)
groupBy('channel','lastweek',lastweek)
groupBy('languague','lastweek',lastweek)

#lastMonth
filterDate = fecha - timedelta(days=30)
lastmonth = data.loc[(data['date'] >= filterDate)
                     & (data['date'] < fecha)]

groupBy('url','lastmonth',lastmonth)
groupBy('channel','lastmonth',lastmonth)
groupBy('languague','lastmonth',lastmonth)
#allDataSet
groupBy('url','all',data)
groupBy('channel','all',data)
groupBy('languague','all',data)


In [13]:
dmxps = DomainsPerPost(df)
dmxps = dmxps.sort_values('id', ascending=True)
print(dmxps)

                     url   id
33     www.elpais.com.uy    1
28          www.eitb.eus    1
29   www.eldiasegovia.es    1
102   www.sevilla.abc.es    1
34   www.elperiodico.com    1
..                   ...  ...
31        www.elmundo.es   76
54         www.gaceta.es   76
83        www.nypost.com   84
101       www.rumble.com  104
45      www.facebook.com  159

[129 rows x 2 columns]


In [14]:
#Segundo analisis descriptivo por lenguaje 
#Date of analisis 
import os 
lan = ['es','en']
path = os.getcwd() 

for x in lan:
      #We create the folders for the tables
      try:
            os.mkdir(path+'/tables/'+str(x))
      except OSError as error:
            error
            
      #We filter by  languague 
      df = data.loc[(data['languague'] == x)]
      #We filter by the date

      today = df.loc[(df['date'] == fecha)]
      languagueGroupBy('url','today',today,str(x))
      languagueGroupBy('channel','today',today,str(x))
      #lastWeek
      filterDate = fecha - timedelta(days=7)
      lastweek = df.loc[(df['date'] >= filterDate)
                        & (df['date'] < fecha)]
      languagueGroupBy('url','lastweek',lastweek,str(x))
      languagueGroupBy('channel','lastweek',lastweek,str(x))
      
      #lastMonth
      filterDate = fecha - timedelta(days=30)
      lastmonth = df.loc[(df['date'] >= filterDate)
                        & (df['date'] < fecha)]

      languagueGroupBy('url','lastmonth',lastmonth,str(x))
      languagueGroupBy('channel','lastmonth',lastmonth,str(x))

      #allDataSet
      languagueGroupBy('url','all',df,str(x))
      languagueGroupBy('channel','all',df,str(x))
