### Creates 3 variables to check if the search string appears in the texts (**str_in_text**) of the tweets, in the usernames (**str_in_user**) or in the related conversations (**str_in_conv**) 

In [1]:
import pandas as pd
import string
import unidecode
from tqdm import tqdm, tqdm_notebook
pd.options.display.max_colwidth = 250  # show up to 250 characters per cell when frames are displayed
pd.__version__ # does not work with pandas 0.25 https://github.com/tqdm/tqdm/issues/780
# Registers the progress_* methods (e.g. progress_apply, used further down) on pandas objects.
tqdm_notebook().pandas()

'0.24.2'

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [2]:
# Gzip-compressed pickle of the tweets DataFrame that the next cell loads.
INPUT_FILE_GZIP = './_sources_final/pickles/pression_sociale_rounds_1_2_3_corrected_search_str_opt_clean_text_dup100_clean.pickle.gzip'

In [3]:
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
df = pd.read_pickle(INPUT_FILE_GZIP, compression='gzip')

In [4]:
# Number of tweets loaded
df.shape[0]

15882669

In [5]:
# Peek at the first two rows to see the available columns
df.iloc[:2]

Unnamed: 0,id,conversation_id,date_end,date_start,datetime,is_quote,is_reply,is_retweet,nbr_favorite,nbr_reply,nbr_retweet,search_string,text,user_id,user_name,string_date,text_clean,dup_100
0,940606059626090496,940606059626090496,2018-02-25,2017-12-11,2017-12-12 16:35:34,False,False,False,3,0,2,Accord Healthcare,We talk exclusively to Accord Healthcare 's EU Supply Operations Director about the firms #supplychain competence and agility in the crowded #healthcare market http:// bit.ly/2z3W22g pic.twitter.com/jKuLRbUQqT,60939437,SupplyChainD,accordhealthcare_2017-12-11,we talk exclusively to accord healthcare s eu supply operations director about the firms supply chain competence and agility in the crowded healthcare market,False
1,940254856471896064,940254856471896064,2018-02-25,2017-12-11,2017-12-11 17:20:01,False,False,False,0,0,0,Accord Healthcare,"""The equitable/legal title conundrum and claiming priority in #patent applications"" \n\nRead our article following the Accord Healthcare Limited v Research Corporation Technologies Inc, 2017 decision: http:// bit.ly/2yeZfrT \n\n#priority ...",498639837,VennerShipley,accordhealthcare_2017-12-11,the equitable legal title conundrum and claiming priority in patent applications read our article following the accord healthcare limited v research corporation technologies inc decision priority,False


In [6]:
# We check whether every element of the search string is present in the text.
# The reason is that "#Bae #Systems", "Bae   Systems" and "Bae #Systems" must all be valid for "Bae Systems";
# likewise j.p.morgan = jp morgan = j. p. morgan, etc.
# Accents are removed as well: giraffe cafés.
# Careful: Jäger = Jaeger, this has to be taken into account too.
# "sainsburys" also returns "sainsbury" and "jet2.com" also returns "jet2".
# Twitter search works this way.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))  # map punctuation to space


def _squash(ascii_lower):
    """Replace punctuation with spaces, collapse whitespace and unify the jager/jaeger spelling."""
    spaced = ascii_lower.translate(_PUNCT_TO_SPACE)
    collapsed = ' '.join(spaced.split())  # no multiple whitespaces
    # unidecode turns "ä" into "a"; rewriting to "jaeger" makes both spellings identical.
    return collapsed.replace('jager', 'jaeger')


def is_in_str(txt, st):
    """Tell whether every word of the search string ``st`` occurs in ``txt``.

    Both arguments are transliterated to ASCII with unidecode, lower-cased,
    stripped of punctuation (replaced by spaces) and normalised so that
    "jager" and "jaeger" are the same spelling. Each whitespace-separated word
    of the search string must then be a substring of the text; word order and
    word boundaries are not checked.

    Parameters
    ----------
    txt : str
        Text to search in (tweet text, user name, ...).
    st : str
        Search string. "jet2.com" is searched as "jet2" and "sainsburys" as
        "sainsbury".

    Returns
    -------
    bool
        True when all words of the search string are found. False when one is
        missing, when either argument is not a string (e.g. None/NaN), or when
        the search string contains no word at all.
    """
    if not isinstance(txt, str) or not isinstance(st, str):
        return False

    needle = unidecode.unidecode(st).lower()
    needle = needle.replace('jet2.com', 'jet2')  # must happen before punctuation is stripped
    # TODO: this should be re-run with kukd.com as well
    needle = needle.replace('sainsburys', 'sainsbury')

    # The search string gets the same jager -> jaeger rewrite as the text;
    # otherwise a search for "Jäger" could never match anything.
    words = _squash(needle).split()
    if not words:
        # all() over no words would be True and flag every text as a match.
        return False

    haystack = _squash(unidecode.unidecode(txt).lower())
    return all(word in haystack for word in words)

In [7]:
## tests
test = 'Hi Jordan, would you send us your message via our FB page please? cheers, Jäger -John'
print(is_in_str(test, 'Jaeger'))
test = 'Hi Jordan, would you send us your message via our FB page please? cheers, jet2 -John'
print(is_in_str(test, 'Jet2.com'))
test = 'Hi Jordan, would you send us your message via our FB page please? cheers, #Bae #System -John'
print(is_in_str(test, 'Bae System'))

True
True
True


#### Check if search string is in the text

In [8]:
# Iterate over the two needed columns directly: a row-wise apply builds a Series
# for each of the ~16M rows and relies on tqdm's pandas patch (progress_apply).
df['str_in_text'] = [
    is_in_str(text, search)
    for text, search in tqdm_notebook(zip(df['text'], df['search_string']), total=len(df))
]

HBox(children=(IntProgress(value=0, max=15882669), HTML(value='')))




In [9]:
# Number of tweets whose text does not contain the search string
int(df['str_in_text'].eq(False).sum())

1439161

#### Check if search string is in the user_name

In [10]:
# Iterate over the two needed columns directly: a row-wise apply builds a Series
# for each of the ~16M rows and relies on tqdm's pandas patch (progress_apply).
df['str_in_user'] = [
    is_in_str(user_name, search)
    for user_name, search in tqdm_notebook(zip(df['user_name'], df['search_string']), total=len(df))
]

HBox(children=(IntProgress(value=0, max=15882669), HTML(value='')))




In [11]:
# Number of tweets whose user name contains the search string
int(df['str_in_user'].eq(True).sum())

1216393

#### Check if search string is in the conversation text

In [12]:
def is_in_conv(id, conversation_id, st, str_in_text):
    """Tell whether the search string occurs in the first tweet of the conversation.

    Parameters
    ----------
    id : tweet id of the row being checked (name kept for existing callers,
        although it shadows the ``id`` builtin).
    conversation_id : id of the tweet that started the conversation.
    st : search string, forwarded to ``is_in_str``.
    str_in_text : whether the search string was already found in the tweet's
        own text; expected to be a boolean.

    Returns
    -------
    bool
        False when the tweet already matches on its own text, when it is the
        conversation root itself (no discussion), when the root tweet is not
        present in the global ``df`` or has no usable text; otherwise the
        result of ``is_in_str`` on the root tweet's text.

    NOTE(review): every lookup compares the whole ``df['id']`` column, so
    calling this once per row is quadratic in the size of ``df``.
    """
    # The string is already in the text, or there is no discussion to inspect.
    if str_in_text or id == conversation_id:
        return False

    # Check whether the original text of the discussion exists in the data.
    root_texts = df.loc[df['id'] == conversation_id, 'text']
    if root_texts.empty:
        return False

    conversation = root_texts.iloc[0]
    # Explicit checks replace the former bare `except:`, which also swallowed
    # KeyboardInterrupt and hid genuine errors.
    if not isinstance(conversation, str) or not isinstance(st, str):
        return False

    # Check whether the string is in that text.
    return is_in_str(conversation, st)

In [13]:
# Resolve every conversation root through a single id -> text lookup instead of
# scanning the whole frame once per tweet, which is what is_in_conv does.
root_text_by_id = df[['id', 'text']].drop_duplicates(subset='id').set_index('id')['text']  # first row wins for a repeated id
root_texts = df['conversation_id'].map(root_text_by_id)  # NaN when the root tweet is not in df
df['str_in_conv'] = [
    # Same rules as is_in_conv: False if the tweet already matches, if it is the
    # conversation root itself, or if the root text is unavailable.
    (not in_text) and tweet_id != root_id and isinstance(root_text, str) and is_in_str(root_text, search)
    for in_text, tweet_id, root_id, root_text, search in tqdm_notebook(
        zip(df['str_in_text'], df['id'], df['conversation_id'], root_texts, df['search_string']),
        total=len(df),
    )
]

HBox(children=(IntProgress(value=0, max=15882669), HTML(value='')))




In [14]:
# Number of tweets whose conversation root text contains the search string
int(df['str_in_conv'].eq(True).sum())

572850

### Save file

In [15]:
# Peek at the first two rows, now including the three str_in_* columns
df.iloc[:2]

Unnamed: 0,id,conversation_id,date_end,date_start,datetime,is_quote,is_reply,is_retweet,nbr_favorite,nbr_reply,...,search_string,text,user_id,user_name,string_date,text_clean,dup_100,str_in_text,str_in_user,str_in_conv
0,940606059626090496,940606059626090496,2018-02-25,2017-12-11,2017-12-12 16:35:34,False,False,False,3,0,...,Accord Healthcare,We talk exclusively to Accord Healthcare 's EU Supply Operations Director about the firms #supplychain competence and agility in the crowded #healthcare market http:// bit.ly/2z3W22g pic.twitter.com/jKuLRbUQqT,60939437,SupplyChainD,accordhealthcare_2017-12-11,we talk exclusively to accord healthcare s eu supply operations director about the firms supply chain competence and agility in the crowded healthcare market,False,True,False,False
1,940254856471896064,940254856471896064,2018-02-25,2017-12-11,2017-12-11 17:20:01,False,False,False,0,0,...,Accord Healthcare,"""The equitable/legal title conundrum and claiming priority in #patent applications"" \n\nRead our article following the Accord Healthcare Limited v Research Corporation Technologies Inc, 2017 decision: http:// bit.ly/2yeZfrT \n\n#priority ...",498639837,VennerShipley,accordhealthcare_2017-12-11,the equitable legal title conundrum and claiming priority in patent applications read our article following the accord healthcare limited v research corporation technologies inc decision priority,False,True,False,False


In [16]:
# Destination of the gzip-compressed pickle written by the next cell.
OUTPUT_FILE_GZIP = './_sources_final/pickles/pression_sociale_rounds_1_2_3_corrected_search_str_opt_clean_text_dup100_clean_matches.pickle.gzip'

In [17]:
# Persist the frame, now carrying the three match flags, as a gzip-compressed pickle
df.to_pickle(path=OUTPUT_FILE_GZIP, compression='gzip')

### Save control file

In [18]:
#### Neither in the tweet text, nor in the user name, nor in the conversation

In [19]:
# Number of tweets for which none of the three checks found the search string
int(
    (
        df['str_in_text'].eq(False)
        & df['str_in_user'].eq(False)
        & df['str_in_conv'].eq(False)
    ).sum()
)

215495

In [20]:
# Keep only the tweets for which none of the three checks found the search string
unmatched_everywhere = (
    df['str_in_text'].eq(False)
    & df['str_in_user'].eq(False)
    & df['str_in_conv'].eq(False)
)
no_matches = df[unmatched_everywhere]

In [21]:
# First five unmatched tweets
no_matches.iloc[:5]

Unnamed: 0,id,conversation_id,date_end,date_start,datetime,is_quote,is_reply,is_retweet,nbr_favorite,nbr_reply,...,search_string,text,user_id,user_name,string_date,text_clean,dup_100,str_in_text,str_in_user,str_in_conv
50,859880346770395138,859762834015752192,2017-05-15,2017-02-28,2017-05-03 23:20:25,False,True,False,1,0,...,Actavo,@ countryboy606 Fake news....!,2894787904,greenshame,actavo_2017-02-28,countryboy fake news,False,False,False,False
51,859879226215976962,859762834015752192,2017-05-15,2017-02-28,2017-05-03 23:15:57,False,True,False,0,0,...,Actavo,Like the Figures from any Gov department are so Accurate. Garda. Housing. Hospital. Homeless. Gov Spin.,128258493,countryboy606,actavo_2017-02-28,like the figures from any gov department are so accurate garda housing hospital homeless gov spin,False,False,False,False
56,859461174513086464,859439099136139265,2017-05-15,2017-02-28,2017-05-02 19:34:46,False,True,False,2,0,...,Actavo,We like this write up #innovation #creative #development,3212099602,THINK_KLAWZ,actavo_2017-02-28,we like this write up innovation,False,False,False,False
57,859451287217332224,859439099136139265,2017-05-15,2017-02-28,2017-05-02 18:55:29,False,True,False,3,1,...,Actavo,Brilliant write up @TIGER_KLAWZ springs to mind #innovation #creative #development,1277775534,TIGERSALTD,actavo_2017-02-28,brilliant write up springs to mind innovation,False,False,False,False
353,855210317684649984,854810320761745408,2017-05-15,2017-02-28,2017-04-21 02:03:23,False,False,False,0,0,...,Actavo,"Wrexham underway. Tullamore IRE also, others?",331137622,ADXGuy,actavo_2017-02-28,wrexham underway tullamore ire also others,False,False,False,False


In [22]:
# Destination of the Excel control file written by the next cell.
OUTPUT_CONTROL_FILE = './_sources_final/controls/04_no_matches.xls'

In [23]:
# Export only the first 1000 unmatched tweets as a control sample
no_matches.head(1000).to_excel(OUTPUT_CONTROL_FILE)