# Frequency Analysis for South Florida (April 25th - September 15th)

NB: This notebook complements the blog post published on our [website]().

To run this notebook correctly, run the cells one at a time, in order, and wait until the `*` symbol next to a cell has turned into a number before running the next one.

__Authors:__

* Susanna Allés Torrent, [susanna_alles@miami.edu](mailto:susanna_alles@miami.edu), University of Miami
* Dieyun Song, [dxs1138@miami.edu](mailto:dxs1138@miami.edu), University of Miami 
* Jerry Bonnell, [j.bonnell@miami.edu](mailto:j.bonnell@miami.edu), University of Miami



## Setting up

In [234]:
import numpy as np
import pandas as pd
from coveet import count_ngrams, uniq_vocab_by_group
from collections import Counter
from pprint import pprint
import matplotlib.pyplot as plt

# Do not truncate long cell values (e.g. full tweet text) when frames are displayed.
pd.set_option('display.max_colwidth', None)
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

# April 25th - May 15th

In [2]:
!python3 coveet.py query -g fl -l en es -d 2020-04-25 2020-05-15 

Namespace(all=False, date=[datetime.datetime(2020, 4, 25, 0, 0), datetime.datetime(2020, 5, 15, 0, 0)], func=<function handle_query at 0x7f812a56d550>, geo=['fl'], lang=['en', 'es'], search=None)
wrote df to dhcovid_2020-4-25_2020-5-15_en_es_fl.csv 🎉


In [370]:
df = pd.read_csv('dhcovid_2020-4-25_2020-5-15_en_es_fl.csv', index_col=0)

In [371]:
!python3 coveet.py tidy -file dhcovid_2020-4-25_2020-5-15_en_es_fl.csv -stopwords ../stopwords/stopwords_en.txt ../stopwords/stopwords_es.txt ../stopwords/stopwords_hashtags.txt

Namespace(file='dhcovid_2020-4-25_2020-5-15_en_es_fl.csv', func=<function handle_tidy at 0x7fa384255670>, lemmatize=False, search=None, stopwords=['../stopwords/stopwords_en.txt', '../stopwords/stopwords_es.txt', '../stopwords/stopwords_hashtags.txt'])
wrote tidied df to dhcovid_2020-4-25_2020-5-15_en_es_fl_stopworded.csv 🎉


## Top 100 words (04/25-05/15)

In [372]:
# Analysis parameters (later cells read these as well).
top_n = 100        # keep the 100 most frequent entries per group
n_gram = 1         # n = 1 asks for single-word frequencies
col_name = 'text'  # set to either 'text' or 'hashtags'

# Start from the stop-worded file (you can re-run from here), parse the dates,
# split the text and hashtag strings into token lists, and drop rows with no text.
df = (
    pd.read_csv('dhcovid_2020-4-25_2020-5-15_en_es_fl_stopworded.csv', index_col=0)
    .assign(
        date=lambda frame: pd.to_datetime(frame['date']),
        text=lambda frame: frame['text'].str.split(),
        hashtags=lambda frame: frame['hashtags'].str.split(),
    )
    .dropna(subset=[col_name])
)

# Most common n-grams for every (geo, lang) group.
counts = {
    key: Counter(count_ngrams(tokens, n_gram, consecutive=False)).most_common(top_n)
    for key, tokens in df.groupby(["geo", "lang"])[col_name]
}
# pprint(counts)  # uncomment to inspect the raw counts

Table:

In [373]:
# One summary table per (geo, lang) group, built from `counts`.
# 'percentage' is occurrences divided by the number of rows of `df` in the
# group, rounded to two decimals (i.e. a fraction, not a value out of 100).
tab_df_data = {}
for group_key, ranked in counts.items():
    geo, lang = group_key[0], group_key[1]
    n_rows = len(df[(df['geo'] == geo) & (df['lang'] == lang)])
    tab_df_data[group_key] = pd.DataFrame({
        'word': [" ".join(ngram) for ngram, _ in ranked],
        '# occurrences': [freq for _, freq in ranked],
        'percentage': [round(freq / n_rows, 2) for _, freq in ranked],
    })

In [374]:
pd.set_option('display.max_rows', 100)

In [375]:
tab_df_data[('fl', 'en')]

Unnamed: 0,word,# occurrences,percentage
0,pandemic,1138,0.07
1,coronavirus,871,0.05
2,people,846,0.05
3,florida,702,0.04
4,help,695,0.04
5,trump,625,0.04
6,cases,612,0.04
7,health,574,0.03
8,time,515,0.03
9,home,511,0.03


In [376]:
tab_df_data[('fl', 'es')]

Unnamed: 0,word,# occurrences,percentage
0,casos,527,0.11
1,pandemia,379,0.08
2,coronavirus,279,0.06
3,muertes,257,0.05
4,nuevos,224,0.05
5,eeuu,221,0.05
6,pais,218,0.04
7,florida,215,0.04
8,contagios,183,0.04
9,salud,183,0.04


## Top 50 hashtags (04/25-05/15)

In [378]:
# Same frequency count as for the words, now on the hashtag lists.
top_n, n_gram, col_name = 50, 1, 'hashtags'  # top 50 single hashtags

# Only tweets that carry at least one hashtag can contribute.
df_text = df.dropna(subset=[col_name])
counts = {
    key: Counter(count_ngrams(tags, n_gram, consecutive=False)).most_common(top_n)
    for key, tags in df_text.groupby(["geo", "lang"])[col_name]
}
# pprint(counts)  # uncomment to inspect the raw counts

Table: 

In [379]:
# One summary table per (geo, lang) group, built from `counts`.
# 'percentage' is occurrences divided by the number of rows of `df` in the
# group, rounded to two decimals (i.e. a fraction, not a value out of 100).
tab_df_data = {}
for group_key, ranked in counts.items():
    geo, lang = group_key[0], group_key[1]
    n_rows = len(df[(df['geo'] == geo) & (df['lang'] == lang)])
    tab_df_data[group_key] = pd.DataFrame({
        'word': [" ".join(ngram) for ngram, _ in ranked],
        '# occurrences': [freq for _, freq in ranked],
        'percentage': [round(freq / n_rows, 2) for _, freq in ranked],
    })

In [380]:
pd.set_option('display.max_rows', 100)

In [381]:
tab_df_data[('fl', 'en')]

Unnamed: 0,word,# occurrences,percentage
0,#health,213,0.01
1,#acscovid19,171,0.01
2,#givingtuesdaynow,146,0.01
3,#miami,107,0.01
4,#pandemic,88,0.01
5,#stayhome,77,0.0
6,#360wisemedia,77,0.0
7,#florida,70,0.0
8,#news,67,0.0
9,#staysafe,59,0.0


In [382]:
tab_df_data[('fl', 'es')]

Unnamed: 0,word,# occurrences,percentage
0,#tvv,309,0.06
1,#cambioweb,298,0.06
2,#tvvnoticias,285,0.06
3,#evnews,189,0.04
4,#internacionales,187,0.04
5,#florida,107,0.02
6,#nacionales,105,0.02
7,#miami,86,0.02
8,#eeuu,73,0.02
9,#cubanosporelmundo,66,0.01


## Unique 50 hashtags (04/25-05/15)

In [390]:
# Hashtags that are specific to each (geo, lang) group.
# NOTE: `n_gram` and `top_n` are not set here; they come from the preceding
# "Top 50 hashtags" cell, which must have been run first.
col_name = "hashtags"
df = df.dropna(subset=[col_name])  # from here on `df` only holds tweets with hashtags

# Vocabulary per (geo, lang) key; presumably the hashtags used by that group
# only (see coveet.uniq_vocab_by_group -- TODO confirm).
grouping = df.groupby(["geo", "lang"])[col_name]
vocab_dic = uniq_vocab_by_group(grouping)  # unique vocab dictionary

# Reduce every tweet's hashtags to the ones found in its own group's vocabulary.
df[col_name] = df.apply(lambda x: set(x[col_name]) & vocab_dic[(x['geo'], x['lang'])],
                        result_type='reduce', axis=1)  # filtering tweets by that dic

# Group again AFTER the filter. The earlier `grouping` was created from the
# unfiltered column and only reflects the assignment above if pandas happens
# to write the new values in place, which newer pandas versions do not do.
grouping = df.groupby(["geo", "lang"])[col_name]
counts = {k: Counter(count_ngrams(t, n_gram, consecutive=False)).most_common(top_n)
          for k, t in grouping}
# pprint(counts)  # uncomment to inspect the raw counts

Table:

In [387]:
# One summary table per (geo, lang) group, built from `counts`.
# 'percentage' is occurrences divided by the number of rows of `df` in the
# group, rounded to two decimals (i.e. a fraction, not a value out of 100).
tab_df_data = {}
for group_key, ranked in counts.items():
    geo, lang = group_key[0], group_key[1]
    n_rows = len(df[(df['geo'] == geo) & (df['lang'] == lang)])
    tab_df_data[group_key] = pd.DataFrame({
        'word': [" ".join(ngram) for ngram, _ in ranked],
        '# occurrences': [freq for _, freq in ranked],
        'percentage': [round(freq / n_rows, 2) for _, freq in ranked],
    })

In [388]:
tab_df_data[('fl', 'en')]

Unnamed: 0,word,# occurrences,percentage
0,#health,213,0.05
1,#acscovid19,171,0.04
2,#givingtuesdaynow,146,0.03
3,#360wisemedia,77,0.02
4,#news,67,0.01
5,#breaking,49,0.01
6,#trumpvirus,45,0.01
7,#ai,42,0.01
8,#hr,40,0.01
9,#fortlauderdale,38,0.01


In [389]:
tab_df_data[('fl', 'es')]

Unnamed: 0,word,# occurrences,percentage
0,#tvv,309,0.14
1,#cambioweb,298,0.14
2,#tvvnoticias,285,0.13
3,#evnews,189,0.09
4,#internacionales,187,0.09
5,#nacionales,105,0.05
6,#eeuu,73,0.03
7,#cubanosporelmundo,66,0.03
8,#opinion,47,0.02
9,#miamimundo,39,0.02


## Concordances (04/25-05/15)

In [298]:
df_concord = pd.read_csv('dhcovid_2020-4-25_2020-5-15_en_es_fl.csv', index_col=0)
df_concord = df_concord.dropna(subset=["text"])

In [299]:
pd.set_option('display.max_rows', df_concord.shape[0]+1)

In [300]:
# df_concord.to_csv("send-results-to-a.csv")

In [301]:
# Concordance search over the raw (not stop-worded) tweets.
# `in` on a string is a substring test, so 'impact' also matches longer words
# such as 'impacto' or 'impactado'.
def filt(text):
    """Return True when the tweet text contains the search term."""
    return 'impact' in text  # a single search term

# Other filters to try (swap the expression returned by `filt`):
#   'college' in text and 'party' in text      # a logical expression
#   'trump' in text and not 'china' in text    # another one to try

# Keep the matches under their own name: `df_concord` stays untouched, so this
# cell can be re-run with a different filter without reloading the CSV.
concord_hits = df_concord[df_concord["text"].apply(filt)]
concord_hits

Unnamed: 0,date,lang,geo,text,hashtags
269,2020-04-26,es,fl,nueva york el mas impactado por la pandemia de covid19 en eeuu registro este domingo su cifra de fallecidos diarios mas baja del mes 367 nuevos decesos 70 menos que el dia anterior y descensos en las hospitalizaciones intubaciones y positivos,#26abr #tvvnoticias
395,2020-04-26,es,fl,argelia decidio relajar el confinamiento y autorizar la apertura de los comercios para atajar el impacto de la pandemia del covid19 en su fragil economia sumida en una aguda crisis por el derrumbe de los precios del petroleo y el gas materias primas de las que depende,
460,2020-04-27,es,fl,que impacto podria tener covid19 en el valor de las viviendas,
581,2020-04-27,es,fl,que impacto podria tener covid19 en el valor de las viviendas,
588,2020-04-27,es,fl,que impacto podria tener covid19 en el valor de las viviendas,#findyourdreamhome #realtor #homesforsale #listings #realestate #teamhlmiami #dreamhome #justlisted #realestateagent
617,2020-04-27,es,fl,el estado de nueva york el mas impactado por la pandemia de covid19 en ee uu registro este domingo su cifra de fallecidos diarios mas baja del mesinformo de 367 nuevos fallecidos 70 menos que el dia anterior y descensos en las hospitalizaciones intubaciones y positivos,
621,2020-04-27,es,fl,la encuesta nacional de impacto de covid19 realizado por revela que mas de mas del 60 de los hospitales del pais no cuentan con guantes y tapabocas,#27abr #tvv
629,2020-04-27,es,fl,industrias aumentan la publicidad en podcasts durante la pandemia estilo de vida 78 articulos para el hogar 81 y alcohol 85 investigacion impacto del covid19 en los oyentes digitales en puerto rico revela que siguen usando los,
695,2020-04-28,es,fl,citgo analizo el impacto del covid19 en la migracion venezolana en latinoamerica,#28abr #internacionales #cambioweb
700,2020-04-28,es,fl,insistio en que mantendra las de su pais cerradas a hasta que mejore la situacion en el viejo continente y califico de el impacto de la enfermedad del,#trump #fronteras #europa #tragico #covid19


# May 15th - June 15th

In [113]:
!python3 coveet.py query -g fl -l en es -d 2020-05-15 2020-06-15

Namespace(all=False, date=[datetime.datetime(2020, 5, 15, 0, 0), datetime.datetime(2020, 6, 15, 0, 0)], func=<function handle_query at 0x7fb81d6ea550>, geo=['fl'], lang=['en', 'es'], search=None)
wrote df to dhcovid_2020-5-15_2020-6-15_en_es_fl.csv 🎉


In [114]:
df = pd.read_csv('dhcovid_2020-5-15_2020-6-15_en_es_fl.csv', index_col=0)

In [115]:
!python3 coveet.py tidy -file dhcovid_2020-5-15_2020-6-15_en_es_fl.csv -stopwords ../stopwords/stopwords_es.txt ../stopwords/stopwords_en.txt ../stopwords/stopwords_hashtags.txt

Namespace(file='dhcovid_2020-5-15_2020-6-15_en_es_fl.csv', func=<function handle_tidy at 0x7fdb34e6c670>, lemmatize=False, search=None, stopwords=['../stopwords/stopwords_es.txt', '../stopwords/stopwords_en.txt', '../stopwords/stopwords_hashtags.txt'])
wrote tidied df to dhcovid_2020-5-15_2020-6-15_en_es_fl_stopworded.csv 🎉


## Top 100 words (05/15-06/15)

In [391]:
# Analysis parameters (later cells read these as well).
top_n = 100        # keep the 100 most frequent entries per group
n_gram = 1         # n = 1 asks for single-word frequencies
col_name = 'text'  # set to either 'text' or 'hashtags'

# Start from the stop-worded file (you can re-run from here), parse the dates
# and split the text and hashtag strings into token lists.
df = (
    pd.read_csv('dhcovid_2020-5-15_2020-6-15_en_es_fl_stopworded.csv', index_col=0)
    .assign(
        date=lambda frame: pd.to_datetime(frame['date']),
        text=lambda frame: frame['text'].str.split(),
        hashtags=lambda frame: frame['hashtags'].str.split(),
    )
)

# Count on the rows that actually have text; `df` itself keeps every row.
df_text = df.dropna(subset=[col_name])
counts = {
    key: Counter(count_ngrams(tokens, n_gram, consecutive=False)).most_common(top_n)
    for key, tokens in df_text.groupby(["geo", "lang"])[col_name]
}
# pprint(counts)  # uncomment to inspect the raw counts

In [392]:
# One summary table per (geo, lang) group, built from `counts`.
# 'percentage' is occurrences divided by the number of rows of `df` in the
# group, rounded to two decimals (i.e. a fraction, not a value out of 100).
tab_df_data = {}
for group_key, ranked in counts.items():
    geo, lang = group_key[0], group_key[1]
    n_rows = len(df[(df['geo'] == geo) & (df['lang'] == lang)])
    tab_df_data[group_key] = pd.DataFrame({
        'word': [" ".join(ngram) for ngram, _ in ranked],
        '# occurrences': [freq for _, freq in ranked],
        'percentage': [round(freq / n_rows, 2) for _, freq in ranked],
    })

In [393]:
tab_df_data[('fl', 'en')]

Unnamed: 0,word,# occurrences,percentage
0,pandemic,1185,0.07
1,people,1022,0.06
2,cases,1018,0.06
3,florida,869,0.05
4,coronavirus,728,0.04
5,like,705,0.04
6,health,699,0.04
7,time,606,0.03
8,deaths,587,0.03
9,trump,569,0.03


In [394]:
tab_df_data[('fl', 'es')]

Unnamed: 0,word,# occurrences,percentage
0,casos,729,0.14
1,pandemia,400,0.08
2,nuevos,311,0.06
3,coronavirus,289,0.06
4,muertes,257,0.05
5,contagios,256,0.05
6,eeuu,248,0.05
7,florida,245,0.05
8,pais,243,0.05
9,venezuela,242,0.05


## Top 50 hashtags (05/15-06/15)

In [396]:
# Same frequency count as for the words, now on the hashtag lists.
top_n, n_gram, col_name = 50, 1, 'hashtags'  # top 50 single hashtags

# Only tweets that carry at least one hashtag can contribute.
df_text = df.dropna(subset=[col_name])
counts = {
    key: Counter(count_ngrams(tags, n_gram, consecutive=False)).most_common(top_n)
    for key, tags in df_text.groupby(["geo", "lang"])[col_name]
}
# pprint(counts)  # uncomment to inspect the raw counts

In [397]:
# One summary table per (geo, lang) group, built from `counts`.
# 'percentage' is occurrences divided by the number of rows of `df` in the
# group, rounded to two decimals (i.e. a fraction, not a value out of 100).
tab_df_data = {}
for group_key, ranked in counts.items():
    geo, lang = group_key[0], group_key[1]
    n_rows = len(df[(df['geo'] == geo) & (df['lang'] == lang)])
    tab_df_data[group_key] = pd.DataFrame({
        'word': [" ".join(ngram) for ngram, _ in ranked],
        '# occurrences': [freq for _, freq in ranked],
        'percentage': [round(freq / n_rows, 2) for _, freq in ranked],
    })

In [398]:
pd.set_option('display.max_rows', 100)

In [399]:
tab_df_data[('fl', 'en')]

Unnamed: 0,word,# occurrences,percentage
0,#health,177,0.01
1,#florida,128,0.01
2,#breaking,126,0.01
3,#miami,106,0.01
4,#pandemic,100,0.01
5,#news,97,0.01
6,#quarantine,63,0.0
7,#healthcare,62,0.0
8,#maritime,62,0.0
9,#cdc,60,0.0


In [400]:
tab_df_data[('fl', 'es')]

Unnamed: 0,word,# occurrences,percentage
0,#cambioweb,387,0.08
1,#tvv,313,0.06
2,#tvvnoticias,281,0.06
3,#evnews,242,0.05
4,#internacionales,196,0.04
5,#nacionales,191,0.04
6,#pandemia,121,0.02
7,#esnoticia,74,0.01
8,#venezuela,73,0.01
9,#eeuu,59,0.01


## Unique 50 hashtags (05/15-06/15)

In [401]:
# Hashtags that are specific to each (geo, lang) group.
# NOTE: `n_gram` and `top_n` are not set here; they come from the preceding
# "Top 50 hashtags" cell, which must have been run first.
col_name = "hashtags"
df = df.dropna(subset=[col_name])  # from here on `df` only holds tweets with hashtags

# Vocabulary per (geo, lang) key; presumably the hashtags used by that group
# only (see coveet.uniq_vocab_by_group -- TODO confirm).
grouping = df.groupby(["geo", "lang"])[col_name]
vocab_dic = uniq_vocab_by_group(grouping)  # unique vocab dictionary

# Reduce every tweet's hashtags to the ones found in its own group's vocabulary.
df[col_name] = df.apply(lambda x: set(x[col_name]) & vocab_dic[(x['geo'], x['lang'])],
                        result_type='reduce', axis=1)  # filtering tweets by that dic

# Group again AFTER the filter. The earlier `grouping` was created from the
# unfiltered column and only reflects the assignment above if pandas happens
# to write the new values in place, which newer pandas versions do not do.
grouping = df.groupby(["geo", "lang"])[col_name]
counts = {k: Counter(count_ngrams(t, n_gram, consecutive=False)).most_common(top_n)
          for k, t in grouping}
# pprint(counts)  # uncomment to inspect the raw counts

In [402]:
# One summary table per (geo, lang) group, built from `counts`.
# 'percentage' is occurrences divided by the number of rows of `df` in the
# group, rounded to two decimals (i.e. a fraction, not a value out of 100).
tab_df_data = {}
for group_key, ranked in counts.items():
    geo, lang = group_key[0], group_key[1]
    n_rows = len(df[(df['geo'] == geo) & (df['lang'] == lang)])
    tab_df_data[group_key] = pd.DataFrame({
        'word': [" ".join(ngram) for ngram, _ in ranked],
        '# occurrences': [freq for _, freq in ranked],
        'percentage': [round(freq / n_rows, 2) for _, freq in ranked],
    })

In [403]:
pd.set_option('display.max_rows', 100)

In [405]:
tab_df_data[('fl', 'en')]

Unnamed: 0,word,# occurrences,percentage
0,#health,177,0.04
1,#maritime,62,0.01
2,#hr,47,0.01
3,#smallbusiness,36,0.01
4,#reopening,35,0.01
5,#zoom,34,0.01
6,#southflorida,33,0.01
7,#360wisemedia,33,0.01
8,#northofnyc,32,0.01
9,#foxnews,31,0.01


In [406]:
tab_df_data[('fl', 'es')]

Unnamed: 0,word,# occurrences,percentage
0,#cambioweb,387,0.18
1,#tvv,313,0.14
2,#tvvnoticias,281,0.13
3,#evnews,242,0.11
4,#internacionales,196,0.09
5,#nacionales,191,0.09
6,#esnoticia,74,0.03
7,#cubanosporelmundo,57,0.03
8,#estrending,39,0.02
9,#brasil,34,0.02


## Concordances (05/15-06/15)

In [453]:
df_concord = pd.read_csv('dhcovid_2020-5-15_2020-6-15_en_es_fl.csv', index_col=0)
df_concord = df_concord.dropna(subset=["text"])

In [454]:
pd.set_option('display.max_rows', df_concord.shape[0]+1)

In [455]:
# df_concord.to_csv("sent-results-to-a.csv")

In [456]:
# Concordance search over the raw (not stop-worded) tweets.
# `in` on a string is a substring test, so the term also matches inside
# longer words.
def filt(text):
    """Return True when the tweet text contains the search term."""
    return 'gasolina' in text  # a single search term

# Other filters to try (swap the expression returned by `filt`):
#   'college' in text and 'party' in text      # a logical expression
#   'trump' in text and not 'china' in text    # another one to try

# Keep the matches under their own name: `df_concord` stays untouched, so this
# cell can be re-run with a different filter without reloading the CSV.
concord_hits = df_concord[df_concord["text"].apply(filt)]
concord_hits

Unnamed: 0,date,lang,geo,text,hashtags
18052,2020-05-15,es,fl,la encuesta nacional sobre el impacto de covid19 en venezuela refleja el incremento en la escasez de gasolina desde que inicio la cuarentena en el pais,#15may #tvvnoticias #tvv
18088,2020-05-15,es,fl,el pais con las mayores reservas probadas de petroleo en el planeta se ve sacudido por una escasez de gasolina que ya cumple varias semanas y esto ocurre en periodo de cuarentena por el covid19,#venezuela #15may
18103,2020-05-15,es,fl,francesco pupillo ci 18188514 fue secuestrado en caño amarillo a las 9 am trabaja de escolta de seguridad de ayudase a saber su paradero gusano covid19 sin gasolina,#cicpc #douglasrico #el
18366,2020-05-17,es,fl,menos litros de gasolina encuentran mas casos de covid19 en uno de los paises con cuarentena mas estricta,
19050,2020-05-20,es,fl,el aislamiento de venezuela y los problemas de movilidad a raiz del transporte y la gasolina sumados a la cuarentena han retrasado brores masivos de covid19 no se confie,
20062,2020-05-26,es,fl,resultados de la encuesta nacional sobre el impacto de covid19 en venezuela reflejan la evolucion en la escasez de gasolina y en otros servicios basicos como el agua y el gas,#26may #tvvnoticias #tvv
20264,2020-05-28,es,fl,policia nacional sorprende a gnb vendiendo gasolina 19 via,#28may #nacionales #cambioweb #covid
20447,2020-05-28,es,fl,estoy de acuerdo en que la gasolina hay que cobrarla dice al informar que dirige un grupo de especialistas para ver el monto del combustible que vendra dentro del plan de normalizacion en medio de la cuarentena por el covid19,#tvv
20477,2020-05-29,es,fl,por permitirme burlarme de uds con las cifras del covid19 por matarlos de hambre por no detener la hiperinflacion por mas de 3 años por quitarles la gasolina por devaluar la moneda gracias pueblo de pendejos gracias por dejarme robar,#graciaspuebloheroico
20695,2020-05-30,es,fl,analista en vtv sugiere que litro de gasolina puede costar 1 dolar 19 via,#30may #nacionales #cambioweb #covid


# June 15th - July 15th

In [142]:
!python3 coveet.py query -g fl -l en es -d 2020-06-15 2020-07-15 

Namespace(all=False, date=[datetime.datetime(2020, 6, 15, 0, 0), datetime.datetime(2020, 7, 15, 0, 0)], func=<function handle_query at 0x7f395dd9e550>, geo=['fl'], lang=['en', 'es'], search=None)
wrote df to dhcovid_2020-6-15_2020-7-15_en_es_fl.csv 🎉


In [202]:
df = pd.read_csv('dhcovid_2020-6-15_2020-7-15_en_es_fl.csv', index_col=0)

In [203]:
!python3 coveet.py tidy -file dhcovid_2020-6-15_2020-7-15_en_es_fl.csv -stopwords ../stopwords/stopwords_en.txt ../stopwords/stopwords_es.txt ../stopwords/stopwords_hashtags.txt

Namespace(file='dhcovid_2020-6-15_2020-7-15_en_es_fl.csv', func=<function handle_tidy at 0x7fbd22b32670>, lemmatize=False, search=None, stopwords=['../stopwords/stopwords_en.txt', '../stopwords/stopwords_es.txt', '../stopwords/stopwords_hashtags.txt'])
wrote tidied df to dhcovid_2020-6-15_2020-7-15_en_es_fl_stopworded.csv 🎉


## Top 100 words (06/15-07/15)

In [408]:
# Analysis parameters (later cells read these as well).
top_n = 100        # keep the 100 most frequent entries per group
n_gram = 1         # n = 1 asks for single-word frequencies
col_name = 'text'  # set to either 'text' or 'hashtags'

# Start from the stop-worded file (you can re-run from here), parse the dates,
# split the text and hashtag strings into token lists, and drop rows with no text.
df = (
    pd.read_csv('dhcovid_2020-6-15_2020-7-15_en_es_fl_stopworded.csv', index_col=0)
    .assign(
        date=lambda frame: pd.to_datetime(frame['date']),
        text=lambda frame: frame['text'].str.split(),
        hashtags=lambda frame: frame['hashtags'].str.split(),
    )
    .dropna(subset=[col_name])
)

# Most common n-grams for every (geo, lang) group.
counts = {
    key: Counter(count_ngrams(tokens, n_gram, consecutive=False)).most_common(top_n)
    for key, tokens in df.groupby(["geo", "lang"])[col_name]
}
# pprint(counts)  # uncomment to inspect the raw counts

In [409]:
# One summary table per (geo, lang) group, built from `counts`.
# 'percentage' is occurrences divided by the number of rows of `df` in the
# group, rounded to two decimals (i.e. a fraction, not a value out of 100).
tab_df_data = {}
for group_key, ranked in counts.items():
    geo, lang = group_key[0], group_key[1]
    n_rows = len(df[(df['geo'] == geo) & (df['lang'] == lang)])
    tab_df_data[group_key] = pd.DataFrame({
        'word': [" ".join(ngram) for ngram, _ in ranked],
        '# occurrences': [freq for _, freq in ranked],
        'percentage': [round(freq / n_rows, 2) for _, freq in ranked],
    })

In [410]:
pd.set_option('display.max_rows', 100)

In [411]:
tab_df_data[('fl', 'en')]

Unnamed: 0,word,# occurrences,percentage
0,cases,2691,0.12
1,florida,2390,0.11
2,people,1590,0.07
3,positive,1273,0.06
4,pandemic,1093,0.05
5,like,962,0.04
6,miami,908,0.04
7,tested,858,0.04
8,trump,849,0.04
9,covid,818,0.04


In [412]:
tab_df_data[('fl', 'es')]

Unnamed: 0,word,# occurrences,percentage
0,casos,897,0.18
1,florida,372,0.08
2,nuevos,356,0.07
3,positivo,324,0.07
4,pandemia,318,0.07
5,contagios,308,0.06
6,coronavirus,246,0.05
7,salud,224,0.05
8,pais,202,0.04
9,muertes,191,0.04


## Top 50 hashtags (06/15-07/15)

In [413]:
# Same frequency count as for the words, now on the hashtag lists.
top_n, n_gram, col_name = 50, 1, 'hashtags'  # top 50 single hashtags

# Only tweets that carry at least one hashtag can contribute.
df_text = df.dropna(subset=[col_name])
counts = {
    key: Counter(count_ngrams(tags, n_gram, consecutive=False)).most_common(top_n)
    for key, tags in df_text.groupby(["geo", "lang"])[col_name]
}
# pprint(counts)  # uncomment to inspect the raw counts

In [414]:
# One summary table per (geo, lang) group, built from `counts`.
# 'percentage' is occurrences divided by the number of rows of `df` in the
# group, rounded to two decimals (i.e. a fraction, not a value out of 100).
tab_df_data = {}
for group_key, ranked in counts.items():
    geo, lang = group_key[0], group_key[1]
    n_rows = len(df[(df['geo'] == geo) & (df['lang'] == lang)])
    tab_df_data[group_key] = pd.DataFrame({
        'word': [" ".join(ngram) for ngram, _ in ranked],
        '# occurrences': [freq for _, freq in ranked],
        'percentage': [round(freq / n_rows, 2) for _, freq in ranked],
    })

In [415]:
pd.set_option('display.max_rows', 100)

In [416]:
tab_df_data[('fl', 'en')]

Unnamed: 0,word,# occurrences,percentage
0,#florida,211,0.01
1,#breaking,163,0.01
2,#miami,145,0.01
3,#news,140,0.01
4,#wearamask,86,0.0
5,#health,80,0.0
6,#pandemic,75,0.0
7,#cdc,71,0.0
8,#miamidade,59,0.0
9,#4boca,57,0.0


In [417]:
tab_df_data[('fl', 'es')]

Unnamed: 0,word,# occurrences,percentage
0,#tvv,344,0.07
1,#cambioweb,282,0.06
2,#tvvnoticias,267,0.06
3,#evnews,187,0.04
4,#nacionales,159,0.03
5,#florida,124,0.03
6,#internacionales,121,0.02
7,#mn24,74,0.02
8,#venezuela,73,0.02
9,#eeuu,66,0.01


## Unique 50 hashtags (06/15-07/15)

In [418]:
# Hashtags that are specific to each (geo, lang) group.
# NOTE: `n_gram` and `top_n` are not set here; they come from the preceding
# "Top 50 hashtags" cell, which must have been run first.
col_name = "hashtags"
df = df.dropna(subset=[col_name])  # from here on `df` only holds tweets with hashtags

# Vocabulary per (geo, lang) key; presumably the hashtags used by that group
# only (see coveet.uniq_vocab_by_group -- TODO confirm).
grouping = df.groupby(["geo", "lang"])[col_name]
vocab_dic = uniq_vocab_by_group(grouping)  # unique vocab dictionary

# Reduce every tweet's hashtags to the ones found in its own group's vocabulary.
df[col_name] = df.apply(lambda x: set(x[col_name]) & vocab_dic[(x['geo'], x['lang'])],
                        result_type='reduce', axis=1)  # filtering tweets by that dic

# Group again AFTER the filter. The earlier `grouping` was created from the
# unfiltered column and only reflects the assignment above if pandas happens
# to write the new values in place, which newer pandas versions do not do.
grouping = df.groupby(["geo", "lang"])[col_name]
counts = {k: Counter(count_ngrams(t, n_gram, consecutive=False)).most_common(top_n)
          for k, t in grouping}
# pprint(counts)  # uncomment to inspect the raw counts

In [419]:
# One summary table per (geo, lang) group, built from `counts`.
# 'percentage' is occurrences divided by the number of rows of `df` in the
# group, rounded to two decimals (i.e. a fraction, not a value out of 100).
tab_df_data = {}
for group_key, ranked in counts.items():
    geo, lang = group_key[0], group_key[1]
    n_rows = len(df[(df['geo'] == geo) & (df['lang'] == lang)])
    tab_df_data[group_key] = pd.DataFrame({
        'word': [" ".join(ngram) for ngram, _ in ranked],
        '# occurrences': [freq for _, freq in ranked],
        'percentage': [round(freq / n_rows, 2) for _, freq in ranked],
    })

In [420]:
pd.set_option('display.max_rows', 100)

In [421]:
tab_df_data[('fl', 'en')]

Unnamed: 0,word,# occurrences,percentage
0,#wearamask,86,0.02
1,#pandemic,75,0.02
2,#4boca,57,0.01
3,#npr,53,0.01
4,#southflorida,40,0.01
5,#maritime,35,0.01
6,#us,32,0.01
7,#floridacovidepicenter,32,0.01
8,#business,30,0.01
9,#staysafe,29,0.01


In [422]:
tab_df_data[('fl', 'es')]

Unnamed: 0,word,# occurrences,percentage
0,#tvv,344,0.17
1,#cambioweb,282,0.14
2,#tvvnoticias,267,0.13
3,#evnews,187,0.09
4,#nacionales,159,0.08
5,#internacionales,121,0.06
6,#mn24,74,0.04
7,#politica,49,0.02
8,#esnoticia,43,0.02
9,#lev,37,0.02


## Concordances (06/15-07/15)

In [306]:
df_concord = pd.read_csv('dhcovid_2020-6-15_2020-7-15_en_es_fl.csv', index_col=0)
df_concord = df_concord.dropna(subset=["text"])

In [307]:
pd.set_option('display.max_rows', df_concord.shape[0]+1)

In [308]:
# df_concord.to_csv("prueba-concordancia.csv")

In [309]:
# Concordance search over the raw (not stop-worded) tweets.
# `in` on a string is a substring test, so the term also matches inside
# longer words.
def filt(text):
    """Return True when the tweet text contains the search term."""
    return 'crisis' in text  # a single search term

# Other filters to try (swap the expression returned by `filt`):
#   'college' in text and 'party' in text      # a logical expression
#   'trump' in text and not 'china' in text    # another one to try

# Keep the matches under their own name: `df_concord` stays untouched, so this
# cell can be re-run with a different filter without reloading the CSV.
concord_hits = df_concord[df_concord["text"].apply(filt)]
concord_hits

Unnamed: 0,date,lang,geo,text,hashtags
28,2020-06-15,es,fl,florangel quintana mientras mis amigos de la genx ven empañado ese vidrio del entusiasmo mundial por lo aprendido durante la crisis del covid19 mi millennial lo ve tras sus lentes de vr :_emoji_not_identified_U2192_:,
33,2020-06-15,es,fl,la crisis generada por la pandemia de covid19 ha puesto en jaque a los teatros hispanos de la ciudad en las manos de nuestro alcalde podriamos encontrar respuestas y soluciones,
313,2020-06-16,es,fl,el arte genera al cerebro la capacidad de transformar al la crisis generada por la pandemia de covid19 ha puesto en jaque a los teatros hispanos de la ciudad en las manos de,#weneedyourhelp #needyourhelp
317,2020-06-16,es,fl,carmelo mesalago cuba crisis economica sus causas el covid19 y las politicas de rescate,
340,2020-06-17,es,fl,en usa estamos como en chile la diferencia es que en la gente no esta pasando crisis economicas extremas igual el covid19 a este nivel esimparable,
353,2020-06-17,es,fl,negocio familiar vende hamburguesas a domicilio en medio de la crisis sanitaria por covid19 mientras vuelve a los foros de grabacion,
375,2020-06-17,es,fl,el gobierno ha aprobado mas de 4300 millones de lempiras o unos 175 millones de dolares para abordar la crisis de covid19 pero los profesionales de la salud se quejan de los hospitales desabastecidos ahora el presidente dice que dio positivo,
468,2020-06-17,es,fl,onu advierte que la crisis del covid19 esta empujando a 40 millones de personas a una situacion de inseguridad alimentaria en america latina y el caribe,
488,2020-06-18,es,fl,entrevista a robert kiyosaki la crisis economica global por covid19 puedes verla en nuestro canal,#emprender #robertkiyosaki
512,2020-06-18,es,fl,abrio 177 indagaciones por hechos de corrupcion relacionados con recursos publicos para atender la crisis sanitaria del,#colombia #coronavirus


# July 15th - August 15th 

In [31]:
!python3 coveet.py query -g fl -l en es -d 2020-07-15 2020-08-15

Namespace(all=False, date=[datetime.datetime(2020, 7, 15, 0, 0), datetime.datetime(2020, 8, 15, 0, 0)], func=<function handle_query at 0x7f5338171550>, geo=['fl'], lang=['en', 'es'], search=None)
wrote df to dhcovid_2020-7-15_2020-8-15_en_es_fl.csv 🎉


In [210]:
df = pd.read_csv('dhcovid_2020-7-15_2020-8-15_en_es_fl.csv', index_col=0)

In [211]:
!python3 coveet.py tidy -file dhcovid_2020-7-15_2020-8-15_en_es_fl.csv -stopwords ../stopwords/stopwords_es.txt ../stopwords/stopwords_en.txt ../stopwords/stopwords_hashtags.txt

Namespace(file='dhcovid_2020-7-15_2020-8-15_en_es_fl.csv', func=<function handle_tidy at 0x7f1eccc60670>, lemmatize=False, search=None, stopwords=['../stopwords/stopwords_es.txt', '../stopwords/stopwords_en.txt', '../stopwords/stopwords_hashtags.txt'])
wrote tidied df to dhcovid_2020-7-15_2020-8-15_en_es_fl_stopworded.csv 🎉


## Top 100 words (07/15-08/15)

In [423]:
# Analysis parameters (later cells read these as well).
top_n = 100        # keep the 100 most frequent entries per group
n_gram = 1         # n = 1 asks for single-word frequencies
col_name = 'text'  # set to either 'text' or 'hashtags'

# Start from the stop-worded file (you can re-run from here), parse the dates
# and split the text and hashtag strings into token lists.
df = (
    pd.read_csv('dhcovid_2020-7-15_2020-8-15_en_es_fl_stopworded.csv', index_col=0)
    .assign(
        date=lambda frame: pd.to_datetime(frame['date']),
        text=lambda frame: frame['text'].str.split(),
        hashtags=lambda frame: frame['hashtags'].str.split(),
    )
)

# Count on the rows that actually have text; `df` itself keeps every row.
df_text = df.dropna(subset=[col_name])
counts = {
    key: Counter(count_ngrams(tokens, n_gram, consecutive=False)).most_common(top_n)
    for key, tokens in df_text.groupby(["geo", "lang"])[col_name]
}
# pprint(counts)  # uncomment to inspect the raw counts

In [424]:
# One summary table per (geo, lang) group, built from `counts`.
# 'percentage' is occurrences divided by the number of rows of `df` in the
# group, rounded to two decimals (i.e. a fraction, not a value out of 100).
tab_df_data = {}
for group_key, ranked in counts.items():
    geo, lang = group_key[0], group_key[1]
    n_rows = len(df[(df['geo'] == geo) & (df['lang'] == lang)])
    tab_df_data[group_key] = pd.DataFrame({
        'word': [" ".join(ngram) for ngram, _ in ranked],
        '# occurrences': [freq for _, freq in ranked],
        'percentage': [round(freq / n_rows, 2) for _, freq in ranked],
    })

In [425]:
pd.set_option('display.max_rows', 100)

In [426]:
tab_df_data[('fl', 'en')]

Unnamed: 0,word,# occurrences,percentage
0,florida,1892,0.08
1,cases,1733,0.07
2,people,1455,0.06
3,positive,1205,0.05
4,pandemic,1131,0.05
5,deaths,1075,0.05
6,like,1019,0.04
7,trump,982,0.04
8,miami,918,0.04
9,test,885,0.04


In [427]:
tab_df_data[('fl', 'es')]

Unnamed: 0,word,# occurrences,percentage
0,casos,921,0.15
1,vacuna,441,0.07
2,pandemia,420,0.07
3,florida,419,0.07
4,coronavirus,359,0.06
5,muertes,347,0.05
6,nuevos,329,0.05
7,contagios,320,0.05
8,venezuela,295,0.05
9,positivo,294,0.05


## Top 50 hashtags (07/15-08/15)

In [428]:
# Same frequency count as for the words, now on the hashtag lists.
top_n, n_gram, col_name = 50, 1, 'hashtags'  # top 50 single hashtags

# Only tweets that carry at least one hashtag can contribute.
df_text = df.dropna(subset=[col_name])
counts = {
    key: Counter(count_ngrams(tags, n_gram, consecutive=False)).most_common(top_n)
    for key, tags in df_text.groupby(["geo", "lang"])[col_name]
}
# pprint(counts)  # uncomment to inspect the raw counts

In [429]:
# One summary table per (geo, lang) group, built from `counts`.
# 'percentage' is occurrences divided by the number of rows of `df` in the
# group, rounded to two decimals (i.e. a fraction, not a value out of 100).
tab_df_data = {}
for group_key, ranked in counts.items():
    geo, lang = group_key[0], group_key[1]
    n_rows = len(df[(df['geo'] == geo) & (df['lang'] == lang)])
    tab_df_data[group_key] = pd.DataFrame({
        'word': [" ".join(ngram) for ngram, _ in ranked],
        '# occurrences': [freq for _, freq in ranked],
        'percentage': [round(freq / n_rows, 2) for _, freq in ranked],
    })

In [430]:
pd.set_option('display.max_rows', 100)

In [431]:
tab_df_data[('fl', 'en')]

Unnamed: 0,word,# occurrences,percentage
0,#florida,194,0.01
1,#news,192,0.01
2,#miami,142,0.01
3,#breaking,134,0.01
4,#4boca,114,0.0
5,#pandemic,93,0.0
6,#wearamask,77,0.0
7,#health,74,0.0
8,#marlins,71,0.0
9,#cdc,65,0.0


In [432]:
tab_df_data[('fl', 'es')]

Unnamed: 0,word,# occurrences,percentage
0,#cambioweb,697,0.11
1,#tvv,454,0.07
2,#nacionales,418,0.07
3,#tvvnoticias,363,0.06
4,#internacionales,281,0.04
5,#florida,224,0.04
6,#evnews,168,0.03
7,#mn24,155,0.02
8,#politica,127,0.02
9,#miami,111,0.02


## Unique 50 hashtags (07/15-08/15)

In [433]:
# Hashtags that are specific to each (geo, lang) group.
# NOTE: `n_gram` and `top_n` are not set here; they come from the preceding
# "Top 50 hashtags" cell, which must have been run first.
col_name = "hashtags"
df = df.dropna(subset=[col_name])  # from here on `df` only holds tweets with hashtags

# Vocabulary per (geo, lang) key; presumably the hashtags used by that group
# only (see coveet.uniq_vocab_by_group -- TODO confirm).
grouping = df.groupby(["geo", "lang"])[col_name]
vocab_dic = uniq_vocab_by_group(grouping)  # unique vocab dictionary

# Reduce every tweet's hashtags to the ones found in its own group's vocabulary.
df[col_name] = df.apply(lambda x: set(x[col_name]) & vocab_dic[(x['geo'], x['lang'])],
                        result_type='reduce', axis=1)  # filtering tweets by that dic

# Group again AFTER the filter. The earlier `grouping` was created from the
# unfiltered column and only reflects the assignment above if pandas happens
# to write the new values in place, which newer pandas versions do not do.
grouping = df.groupby(["geo", "lang"])[col_name]
counts = {k: Counter(count_ngrams(t, n_gram, consecutive=False)).most_common(top_n)
          for k, t in grouping}
# pprint(counts)  # uncomment to inspect the raw counts

In [434]:
# One summary table per (geo, lang) group, built from `counts`.
# 'percentage' is occurrences divided by the number of rows of `df` in the
# group, rounded to two decimals (i.e. a fraction, not a value out of 100).
tab_df_data = {}
for group_key, ranked in counts.items():
    geo, lang = group_key[0], group_key[1]
    n_rows = len(df[(df['geo'] == geo) & (df['lang'] == lang)])
    tab_df_data[group_key] = pd.DataFrame({
        'word': [" ".join(ngram) for ngram, _ in ranked],
        '# occurrences': [freq for _, freq in ranked],
        'percentage': [round(freq / n_rows, 2) for _, freq in ranked],
    })

In [435]:
pd.set_option('display.max_rows', 100)

In [436]:
tab_df_data[('fl', 'en')]

Unnamed: 0,word,# occurrences,percentage
0,#breaking,134,0.03
1,#4boca,114,0.02
2,#health,74,0.02
3,#npr,48,0.01
4,#newsreport,45,0.01
5,#southflorida,42,0.01
6,#unitedstates,40,0.01
7,#maritime,33,0.01
8,#healthcare,33,0.01
9,#illinois,33,0.01


In [437]:
tab_df_data[('fl', 'es')]

Unnamed: 0,word,# occurrences,percentage
0,#cambioweb,697,0.24
1,#tvv,454,0.16
2,#nacionales,418,0.15
3,#tvvnoticias,363,0.13
4,#internacionales,281,0.1
5,#evnews,168,0.06
6,#mn24,155,0.05
7,#politica,127,0.04
8,#eeuu,103,0.04
9,#venezuela,80,0.03


## Concordances (07/15-08/15)

In [310]:
df_concord = pd.read_csv('dhcovid_2020-7-15_2020-8-15_en_es_fl.csv', index_col=0)
df_concord = df_concord.dropna(subset=["text"])

In [311]:
pd.set_option('display.max_rows', df_concord.shape[0]+1)

In [312]:
# df_concord.to_csv("prueba-concordancia.csv")

In [313]:
# Concordance search over the raw (not stop-worded) tweets.
# `in` on a string is a substring test, so a multi-word phrase works too and
# must appear verbatim in the tweet text.
def filt(text):
    """Return True when the tweet text contains the search phrase."""
    return 'university of miami' in text  # a phrase

# Other filters to try (swap the expression returned by `filt`):
#   'college' in text and 'party' in text      # a logical expression
#   'trump' in text and not 'china' in text    # another one to try

# Keep the matches under their own name: `df_concord` stays untouched, so this
# cell can be re-run with a different filter without reloading the CSV.
concord_hits = df_concord[df_concord["text"].apply(filt)]
concord_hits

Unnamed: 0,date,lang,geo,text,hashtags
1188,2020-07-16,en,fl,the university of miami has reportedly shut down football workouts after at least three players test positive for covid19,
1353,2020-07-16,en,fl,in the midst of the covid19 pandemic the university of miami is front and center in efforts to find a vaccine for the virus,
3053,2020-07-17,en,fl,the university of miami miller school of medicine is looking for volunteers in south florida who are willing to be part of a nationwide study to test potential covid19 vaccines,
3297,2020-07-17,en,fl,university of miami wants you to participate in coronavirus vaccine trials the volunteers must be ages 18 to 55 and should not have had covid19,
3628,2020-07-18,en,fl,um to launch covid19 vaccine testing site for the covid19 prevention trials network inventum university of miami miller school of medicine :_emoji_not_identified_U2066_: :_emoji_not_identified_U2069_:,#medtwitter #covid19 #coronavirus #sarscov2
5622,2020-07-21,en,fl,erin n marcus a professor of clinical medicine at the university of miami school of medicine says florida covid19 outbreak wont improve anytime soon,
5902,2020-07-21,en,fl,i think were gonna need a bigger boat jaws university of miami will turn a dorm into covid19 quarantine space,
5925,2020-07-21,en,fl,the university of miami is already making plans for what will happen when one the inevitable occurs and one or more of its 17000 students gets sick,
5937,2020-07-21,en,fl,the university of miami is converting one of its dorms into a quarantine space for students with covid19,
5950,2020-07-21,en,fl,university of miami will turn a dorm into covid19 quarantine space,


# August 15th - September 15th 

In [40]:
# Run the coveet `query` command for geo `fl`, languages `en` and `es`, and the
# dates 2020-08-15 / 2020-09-15; per the recorded output it writes the result to
# dhcovid_2020-8-15_2020-9-15_en_es_fl.csv.
!python3 coveet.py query -g fl -l en es -d 2020-08-15 2020-09-15

Namespace(all=False, date=[datetime.datetime(2020, 8, 15, 0, 0), datetime.datetime(2020, 9, 15, 0, 0)], func=<function handle_query at 0x7f581de4e550>, geo=['fl'], lang=['en', 'es'], search=None)
wrote df to dhcovid_2020-8-15_2020-9-15_en_es_fl.csv 🎉


In [226]:
# Load the raw query result, using the first CSV column as the index.
df = pd.read_csv('dhcovid_2020-8-15_2020-9-15_en_es_fl.csv', index_col=0)

In [227]:
# Run the coveet `tidy` command on the query CSV with the English, Spanish and
# hashtag stopword lists; per the recorded output it runs with lemmatize=False
# and writes dhcovid_2020-8-15_2020-9-15_en_es_fl_stopworded.csv.
!python3 coveet.py tidy -file dhcovid_2020-8-15_2020-9-15_en_es_fl.csv -stopwords ../stopwords/stopwords_en.txt ../stopwords/stopwords_es.txt ../stopwords/stopwords_hashtags.txt

Namespace(file='dhcovid_2020-8-15_2020-9-15_en_es_fl.csv', func=<function handle_tidy at 0x7ff1524cc670>, lemmatize=False, search=None, stopwords=['../stopwords/stopwords_en.txt', '../stopwords/stopwords_es.txt', '../stopwords/stopwords_hashtags.txt'])
wrote tidied df to dhcovid_2020-8-15_2020-9-15_en_es_fl_stopworded.csv 🎉


## Top 100 words (08/15-09/15)

In [438]:
# Ranking parameters.
col_name = 'text'  # set to either 'text' or 'hashtags'
n_gram = 1  # ask for word frequency (i.e. an n-gram where n=1)
top_n = 100  # top 100 words

# run from here using this file
# Parse the dates, split text and hashtags on whitespace into token lists,
# then drop the rows that have nothing in the column being ranked.
df = (
    pd.read_csv('dhcovid_2020-8-15_2020-9-15_en_es_fl_stopworded.csv', index_col=0)
    .assign(
        date=lambda frame: pd.to_datetime(frame['date']),
        text=lambda frame: frame['text'].str.split(),
        hashtags=lambda frame: frame['hashtags'].str.split(),
    )
    .dropna(subset=[col_name])
)

# For every (geo, lang) group, the `top_n` most frequent n-grams with their counts.
counts = {
    (geo, lang): Counter(count_ngrams(tokens, n_gram, consecutive=False)).most_common(top_n)
    for (geo, lang), tokens in df.groupby(["geo", "lang"])[col_name]
}
#pprint(counts)

In [439]:
# Turn the ranked n-gram counts into one summary table per (geo, lang) group.
tab_df_data = {}
for (geo, lang), ranked in counts.items():
    # number of rows `df` holds for this group; denominator of 'percentage'
    n_tweets = len(df[(df['geo'] == geo) & (df['lang'] == lang)])
    tab_df_data[(geo, lang)] = pd.DataFrame({
        # each ranked entry is (n-gram, count); the n-gram's tokens are joined by spaces
        'word': [" ".join(ngram) for ngram, _count in ranked],
        '# occurrences': [count for _ngram, count in ranked],
        # occurrences / rows in the group, kept as a fraction rounded to 2 decimals
        'percentage': [round(count / n_tweets, 2) for _ngram, count in ranked],
    })

In [440]:
# Let pandas render up to 100 rows before truncating a displayed table.
pd.options.display.max_rows = 100

In [441]:
# Show the table built for the group with geo 'fl' and language 'en'.
tab_df_data[('fl', 'en')]

Unnamed: 0,word,# occurrences,percentage
0,cases,1023,0.06
1,pandemic,1005,0.06
2,people,971,0.06
3,florida,873,0.05
4,trump,821,0.05
5,deaths,730,0.04
6,like,712,0.04
7,covid,694,0.04
8,health,614,0.04
9,positive,598,0.04


In [442]:
# Show the table built for the group with geo 'fl' and language 'es'.
tab_df_data[('fl', 'es')]

Unnamed: 0,word,# occurrences,percentage
0,casos,591,0.13
1,vacuna,369,0.08
2,pandemia,300,0.06
3,nuevos,263,0.06
4,contagios,241,0.05
5,venezuela,236,0.05
6,muertes,236,0.05
7,salud,231,0.05
8,florida,219,0.05
9,coronavirus,214,0.05


## Top 50 hashtags (08/15-09/15)

In [443]:
# Ranking parameters.
col_name = 'hashtags'  # set to either 'text' or 'hashtags'
n_gram = 1  # ask for word frequency (i.e. an n-gram where n=1)
top_n = 50  # top 50 words

# Rank only the tweets that actually carry hashtags; `df` itself is left as is.
df_text = df.dropna(subset=[col_name])

# For every (geo, lang) group, the `top_n` most frequent n-grams with their counts.
counts = {
    (geo, lang): Counter(count_ngrams(tags, n_gram, consecutive=False)).most_common(top_n)
    for (geo, lang), tags in df_text.groupby(["geo", "lang"])[col_name]
}
#pprint(counts)

In [444]:
# Turn the ranked n-gram counts into one summary table per (geo, lang) group.
# NOTE(review): the denominator below is taken from `df`, while `counts` in the
# cell above was computed from `df_text` (rows with hashtags only) -- confirm
# that dividing by all rows of the group is the intended percentage.
tab_df_data = {}
for (geo, lang), ranked in counts.items():
    # number of rows `df` holds for this group; denominator of 'percentage'
    n_tweets = len(df[(df['geo'] == geo) & (df['lang'] == lang)])
    tab_df_data[(geo, lang)] = pd.DataFrame({
        # each ranked entry is (n-gram, count); the n-gram's tokens are joined by spaces
        'word': [" ".join(ngram) for ngram, _count in ranked],
        '# occurrences': [count for _ngram, count in ranked],
        # occurrences / rows in the group, kept as a fraction rounded to 2 decimals
        'percentage': [round(count / n_tweets, 2) for _ngram, count in ranked],
    })

In [445]:
# Let pandas render up to 100 rows before truncating a displayed table.
pd.options.display.max_rows = 100

In [446]:
# Show the table built for the group with geo 'fl' and language 'en'.
tab_df_data[('fl', 'en')]

Unnamed: 0,word,# occurrences,percentage
0,#breaking,122,0.01
1,#florida,89,0.01
2,#360wisenews,75,0.0
3,#miami,73,0.0
4,#news,67,0.0
5,#cdc,64,0.0
6,#360wisetv,62,0.0
7,#pandemic,58,0.0
8,#canada,56,0.0
9,#360wisecanada,55,0.0


In [447]:
# Show the table built for the group with geo 'fl' and language 'es'.
tab_df_data[('fl', 'es')]

Unnamed: 0,word,# occurrences,percentage
0,#cambioweb,744,0.16
1,#nacionales,431,0.09
2,#internacionales,323,0.07
3,#tvv,312,0.07
4,#politica,298,0.06
5,#tvvnoticias,253,0.05
6,#florida,83,0.02
7,#noticiasmiaminews24,52,0.01
8,#cuba,51,0.01
9,#eeuu,46,0.01


## Unique 50 hashtags (08/15-09/15)

In [448]:
# Rank the hashtags that survive once every tweet's hashtags are restricted to
# the vocabulary `uniq_vocab_by_group` returns for its (geo, lang) group.
# Uses `df`, `n_gram` and `top_n` as left by the cells above.
col_name = "hashtags"
df = df.dropna(subset=[col_name])  # keep only tweets that carry hashtags

# unique vocab dictionary, keyed by (geo, lang); each value is intersected with
# a set below -- presumably the hashtags exclusive to that group (confirm in coveet)
vocab_dic = uniq_vocab_by_group(df.groupby(["geo", "lang"])[col_name])

# filtering tweets by that dic: keep only hashtags present in the group's vocabulary
df[col_name] = df.apply(lambda x: set(x[col_name]) & vocab_dic[(x['geo'], x['lang'])],
                        result_type='reduce', axis=1)

# Group *after* the filter. A groupby selection taken before the assignment
# holds the column as it was at that moment, so it would only see the filtered
# values if pandas happened to overwrite the column in place -- which newer
# pandas versions (and copy-on-write) do not do.
grouping = df.groupby(["geo", "lang"])[col_name]
counts = {k: Counter(count_ngrams(t, n_gram, consecutive=False)).most_common(top_n)
          for k, t in grouping}
#pprint(counts)

In [449]:
# Turn the ranked n-gram counts into one summary table per (geo, lang) group.
tab_df_data = {}
for (geo, lang), ranked in counts.items():
    # number of rows `df` holds for this group; denominator of 'percentage'
    n_tweets = len(df[(df['geo'] == geo) & (df['lang'] == lang)])
    tab_df_data[(geo, lang)] = pd.DataFrame({
        # each ranked entry is (n-gram, count); the n-gram's tokens are joined by spaces
        'word': [" ".join(ngram) for ngram, _count in ranked],
        '# occurrences': [count for _ngram, count in ranked],
        # occurrences / rows in the group, kept as a fraction rounded to 2 decimals
        'percentage': [round(count / n_tweets, 2) for _ngram, count in ranked],
    })

In [450]:
# Let pandas render up to 100 rows before truncating a displayed table.
pd.options.display.max_rows = 100

In [451]:
# Show the table built for the group with geo 'fl' and language 'en'.
tab_df_data[('fl', 'en')]

Unnamed: 0,word,# occurrences,percentage
0,#breaking,122,0.04
1,#360wisenews,75,0.02
2,#360wisetv,62,0.02
3,#360wisecanada,55,0.02
4,#newsreport,50,0.01
5,#npr,44,0.01
6,#california,39,0.01
7,#healthcare,33,0.01
8,#workersfirst,29,0.01
9,#travel,29,0.01


In [452]:
# Show the table built for the group with geo 'fl' and language 'es'.
tab_df_data[('fl', 'es')]

Unnamed: 0,word,# occurrences,percentage
0,#cambioweb,744,0.34
1,#nacionales,431,0.19
2,#internacionales,323,0.15
3,#tvv,312,0.14
4,#politica,298,0.13
5,#tvvnoticias,253,0.11
6,#noticiasmiaminews24,52,0.02
7,#eeuu,46,0.02
8,#estrending,39,0.02
9,#lev,37,0.02


## Concordances (08/15-09/15)

In [294]:
# Load the un-tidied query result (first CSV column as index) and discard
# the rows that have no tweet text, since the concordance searches that column.
df_concord = (
    pd.read_csv('dhcovid_2020-8-15_2020-9-15_en_es_fl.csv', index_col=0)
    .dropna(subset=["text"])
)

In [295]:
# Raise the display limit above the frame's row count so no row is elided.
pd.options.display.max_rows = len(df_concord) + 1

In [296]:
# df_concord.to_csv("prueba-concordancia.csv")

In [297]:
# Concordance filter: keep only the tweets whose text satisfies `filt`.
# `filt` receives one tweet's text (a string) and returns True to keep the row;
# swap in one of the commented alternatives to try a different query.
filt = lambda text: 'universidad' in text  # a single word (substring match)
#filt = lambda text: 'college' in text and 'party' in text      # a logical expression 
# filt = lambda text: 'trump' in text and not 'china' in text    # another one to try

# Evaluate the predicate on the text column only, instead of building a row
# object for every tweet; astype(bool) keeps the result a boolean mask even
# when the frame has no rows.
df_concord = df_concord[df_concord["text"].map(filt).astype(bool)]
df_concord

Unnamed: 0,date,lang,geo,text,hashtags
16699,2020-08-15,es,fl,estados unidos alcanzo este sabado la cifra de 5345610 casos confirmados de covid19 y la de 169313 fallecidos de acuerdo con el recuento independiente de la universidad johns hopkins,#15ago #tvvnoticias
16773,2020-08-15,es,fl,estudio de universidad de florida particulas de covid19 que flotan en el aire son infecciosas y pueden contagiar,
16794,2020-08-15,es,fl,la universidad johns hopkins reporta 21239182 contagios globales por covid19 y 766414 fallecimientos,
16872,2020-08-16,es,fl,estados unidos alcanzo este domingo la cifra de 5388931 casos confirmados de covid19 y la de 169841 fallecidos de acuerdo con el recuento independiente de la universidad johns hopkins,#16ago #tvvnoticias
17022,2020-08-17,es,fl,cientificos de la universidad northwestern descubren vulnerabilidad del covid19,
17125,2020-08-17,es,fl,universidad johns hopkins reporta 21707773 casos de covid19 en el mundo y 775926 fallecidos,
17189,2020-08-17,es,fl,el presidente mexicano andres manuel lopez obrador afirmo este domingo que hay certidumbre sobre la eficacia de la vacuna contra la covid19 de la universidad de oxford y astrazeneca por lo que podria iniciar una campaña de vacunacion el primer trimestre de 2021,#16ago
17289,2020-08-18,es,fl,para este martes estados unidos contabiliza mas de 5 millones 445 mil casos de covid19 y 179 fallecidos por la enfermedad segun datos de la universidad johns hopkins,#18ago #tvvnoticias #tvv
17327,2020-08-18,es,fl,universidad johns hopkins reporta 21913816 casos de covid19 en el mundo y 774682 fallecidos,
17342,2020-08-18,es,fl,segun la universidad johns hopkins hasta ahora se contabilizan 21916639 casos de covid19 en el mundo y 774720 fallecidos a causa de la pandemia,#18ago #tvvnoticias #tvv
