# Twitter Scraping with snscrape

In [None]:
# Install snscrape from git
#pip install git+https://github.com/JustAnotherArchivist/snscrape.git

In [241]:
import os
import pandas as pd
import numpy as np
import re
import requests

In [250]:
def scrape_account(username, results):
    """Scrape an account's tweets with the snscrape CLI and return them filtered.

    Runs ``snscrape --jsonl --max-results <results> twitter-search from:<username>``
    and stores its JSON-lines output in ``user-tweets.json`` in the current
    working directory (overwritten on every call). The file is then loaded
    into a DataFrame, filtered and trimmed.

    Only rows with ``lang == 'es'`` and non-null ``mentionedUsers``,
    ``outlinks`` and ``hashtags`` are kept.

    Parameters
    ----------
    username : str
        Account name, used in the ``from:<username>`` search query.
    results : int
        Value passed to snscrape's ``--max-results`` option.

    Returns
    -------
    pandas.DataFrame
        The filtered tweets. The index is reset and the previous row labels
        are kept in an ``index`` column.

    Raises
    ------
    subprocess.CalledProcessError
        If snscrape exits with a non-zero status.
    FileNotFoundError
        If the ``snscrape`` executable cannot be found.
    """
    # Imported locally because only this function needs it.
    import subprocess

    output_path = 'user-tweets.json'

    # Pass the arguments as a list (no shell): the username cannot inject
    # shell syntax and the quoting works the same on every platform.
    command = ['snscrape', '--jsonl', '--max-results', str(results),
               'twitter-search', 'from:' + str(username)]
    with open(output_path, 'wb') as output_file:
        subprocess.run(command, stdout=output_file, check=True)

    df = pd.read_json(output_path, lines=True)

    # Columns listed as available by the original author:
    #   ['_type', 'url', 'date', 'content', 'renderedContent', 'id', 'user',
    #    'replyCount', 'retweetCount', 'likeCount', 'quoteCount',
    #    'conversationId', 'lang', 'source', 'sourceUrl', 'sourceLabel',
    #    'outlinks', 'tcooutlinks', 'media', 'retweetedTweet', 'quotedTweet',
    #    'inReplyToTweetId', 'inReplyToUser', 'mentionedUsers', 'coordinates',
    #    'hashtags', 'cashtags']

    # Keep Spanish tweets that have mentions, outgoing links and hashtags.
    keep = (
        (df['lang'] == 'es')
        & df['mentionedUsers'].notna()
        & df['outlinks'].notna()
        & df['hashtags'].notna()
    )
    df = df[keep]

    # Remove the columns that are not needed downstream. An earlier version
    # of this list also dropped 'retweetedTweet' and 'inReplyToTweetId';
    # they are kept here. errors='ignore' tolerates scraper output that
    # lacks some of these columns.
    unneeded_columns = ['_type', 'replyCount', 'retweetCount', 'likeCount',
                        'quoteCount', 'sourceLabel', 'cashtags', 'source',
                        'sourceUrl', 'tcooutlinks', 'media', 'place',
                        'coordinates', 'inReplyToUser', 'lang']
    df = df.drop(columns=unneeded_columns, errors='ignore')

    # reset_index() without drop=True keeps the old labels as an 'index' column.
    df = df.reset_index()

    return df

In [108]:
def extract_mention(df):
    """Add a ``mentionedUserName`` column holding each row's first mentioned user.

    For every row, the ``username`` entry of the first element of
    ``mentionedUsers`` is taken. Values are read in row order, so the frame
    may carry any index (it does not have to be 0..n-1).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``mentionedUsers`` column whose values are non-empty
        sequences of mappings with a ``username`` key.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the column is added in place.

    Raises
    ------
    IndexError, KeyError, TypeError
        If a ``mentionedUsers`` value is empty, lacks ``username`` or is not
        subscriptable (e.g. null).
    """
    # Iterate over the values instead of indexing by label with a counter.
    df['mentionedUserName'] = [
        users[0]['username'] for users in df['mentionedUsers']
    ]
    return df

In [128]:
def _first_media_url(quoted_tweet):
    """Return the first media item's ``fullUrl``, else its ``thumbnailUrl``, else ``'error'``."""
    for key in ('fullUrl', 'thumbnailUrl'):
        try:
            return quoted_tweet['media'][0][key]
        except (KeyError, IndexError, TypeError):
            # Missing key, empty media list, or a null / non-subscriptable
            # value: try the next key.
            continue
    return 'error'


def extract_picture_url(df):
    """Add a ``picture_url`` column taken from each row's quoted tweet.

    The URL is the ``fullUrl`` of the first media item of ``quotedTweet``,
    falling back to that item's ``thumbnailUrl``. Rows for which neither can
    be read are removed from the result. Values are read in row order, so
    the frame may carry any index.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``quotedTweet`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the rows with a picture URL. Its index is
        reset with ``reset_index()``, which keeps the previous labels as a
        column (``index``, or ``level_0`` when ``index`` already exists).

    Notes
    -----
    The ``picture_url`` column is also added to the frame passed in, where
    rows without a usable URL hold the string ``'error'``.
    """
    df['picture_url'] = [_first_media_url(quoted) for quoted in df['quotedTweet']]

    # 'error' marks rows without a usable URL; drop them.
    df = df[df['picture_url'] != 'error']
    df = df.reset_index()

    return df

In [129]:
def scrape_tweet(username, results):
    """Scrape an account and return its tweets with mention and picture columns.

    Runs the three steps in order: ``scrape_account`` (fetch and filter),
    ``extract_mention`` (adds ``mentionedUserName``) and
    ``extract_picture_url`` (adds ``picture_url`` and drops rows without one).
    """
    tweets = scrape_account(username, results)
    with_mentions = extract_mention(tweets)
    return extract_picture_url(with_mentions)

In [131]:
# Scrape the account (at most 100000 results requested) and show the frame's shape.
df = scrape_tweet(username='Arachno_Cosas', results=100000)
df.shape

(5939, 15)

## Clean the dataframe

In [None]:
# Label each row with the first hashtag that is exactly 'IM' or 'NIM'.
# The word boundaries stop the pattern from matching inside a longer
# hashtag (e.g. 'NIM' inside 'ANIMAL'), which an unanchored search would do.
patron = r'\b[N]?[I][M]\b'

labels = []
for hashtags in df['hashtags']:
    # Search the string form of the value, so a list of hashtags and an
    # already stringified list are handled the same way.
    match = re.search(patron, str(hashtags))
    # No match -> empty label; those rows are removed below.
    labels.append(match.group() if match else "")

df['label_IM'] = labels
df = df[df['label_IM'] != '']

In [238]:
# Renumber the rows 0..n-1; drop=True discards the old index instead of
# inserting it as a column.
df = df.reset_index(drop=True)

In [199]:
# Preview the hashtags of the first ten rows as a plain list.
df['hashtags'].head(10).tolist()

[['Corinnidae', 'NIM'],
 ['Thomisidae', 'NIM'],
 ['Oxyopidae', 'Peucetia', 'NIM'],
 ['Vaejovidae', 'Vaejovis', 'NIM'],
 ['Araneidae', 'Eriophora', 'NIM'],
 ['Phidippus', 'NIM'],
 ['Latrodectus', 'IM'],
 ['Aphonopelma', 'NIM'],
 ['Araneidae', 'Araneus', 'NIM'],
 ['Lycosidae', 'NIM']]

In [257]:
# Display the first five rows of the frame.
df.head(5)

Unnamed: 0,level_0,index,url,date,content,renderedContent,id,user,conversationId,outlinks,quotedTweet,mentionedUsers,hashtags,mentionedUserName,picture_url,label_IM
0,0,1,https://twitter.com/Arachno_Cosas/status/13204...,2021-09-14 13:46:10+00:00,"¡Hola, @TopoGigo6! Gracias por compartir. Pert...","¡Hola, @TopoGigo6! Gracias por compartir. Pert...",1437774676604985351,"{'_type': 'snscrape.modules.twitter.User', 'us...",1437774676604985351,[https://twitter.com/TopoGigo6/status/14376763...,"{'_type': 'snscrape.modules.twitter.Tweet', 'u...","[{'_type': 'snscrape.modules.twitter.User', 'u...","[Corinnidae, NIM]",TopoGigo6,https://pbs.twimg.com/media/E_Om-6dWUAALKL9?fo...,NIM
1,1,3,https://twitter.com/Arachno_Cosas/status/13204...,2021-09-14 13:42:56+00:00,"¡Hola, @Elwestrand! Gracias por compartir. Per...","¡Hola, @Elwestrand! Gracias por compartir. Per...",1437773863627149314,"{'_type': 'snscrape.modules.twitter.User', 'us...",1437773863627149314,[https://twitter.com/Elwestrand/status/1437712...,"{'_type': 'snscrape.modules.twitter.Tweet', 'u...","[{'_type': 'snscrape.modules.twitter.User', 'u...","[Thomisidae, NIM]",Elwestrand,https://pbs.twimg.com/media/E_PH0fbXEAMPYi5?fo...,NIM
2,2,4,https://twitter.com/Arachno_Cosas/status/13204...,2021-09-14 13:39:45+00:00,"¡Hola, @ddiazgar76! Gracias por la consulta. P...","¡Hola, @ddiazgar76! Gracias por la consulta. P...",1437773064843010057,"{'_type': 'snscrape.modules.twitter.User', 'us...",1437773064843010057,[https://twitter.com/ddiazgar76/status/1437753...,"{'_type': 'snscrape.modules.twitter.Tweet', 'u...","[{'_type': 'snscrape.modules.twitter.User', 'u...","[Oxyopidae, Peucetia, NIM]",ddiazgar76,https://pbs.twimg.com/media/E_PtC1OWYAox7UL?fo...,NIM
3,3,6,https://twitter.com/Arachno_Cosas/status/13204...,2021-09-14 13:35:38+00:00,"¡Hola, @adukation! Gracias por tu consulta. Pe...","¡Hola, @adukation! Gracias por tu consulta. Pe...",1437772025574174722,"{'_type': 'snscrape.modules.twitter.User', 'us...",1437772025574174722,[https://twitter.com/adukation/status/14377698...,"{'_type': 'snscrape.modules.twitter.Tweet', 'u...","[{'_type': 'snscrape.modules.twitter.User', 'u...","[Vaejovidae, Vaejovis, NIM]",adukation,https://pbs.twimg.com/ext_tw_video_thumb/14377...,NIM
4,4,7,https://twitter.com/Arachno_Cosas/status/13204...,2021-09-14 13:33:23+00:00,"¡Hola, @rodrigosolomon! Gracias por compartir....","¡Hola, @rodrigosolomon! Gracias por compartir....",1437771461469605899,"{'_type': 'snscrape.modules.twitter.User', 'us...",1437771461469605899,[https://twitter.com/rodrigosolomon/status/143...,"{'_type': 'snscrape.modules.twitter.Tweet', 'u...","[{'_type': 'snscrape.modules.twitter.User', 'u...","[Araneidae, Eriophora, NIM]",rodrigosolomon,https://pbs.twimg.com/media/E_F7p-8UYAAUC45?fo...,NIM


## Download images from dataframe

In [268]:
def download_images(df, url_column, timeout=30):
    """Download the image behind every URL in ``df[url_column]``.

    Files are written to the ``Imagenes`` folder under the current working
    directory (created if missing) and named ``<id>.png`` after the row's
    ``id`` value. The bytes are saved exactly as received, so the ``.png``
    extension is used whatever the actual image format is.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``id`` column and the column named by ``url_column``.
    url_column : str
        Name of the column holding the image URLs.
    timeout : float, optional
        Seconds to wait for each request before giving up (default 30).

    Notes
    -----
    Responses with an error status are reported and skipped instead of being
    saved as if they were images. Connection errors and timeouts raised by
    ``requests`` are not caught.
    """
    # os.path.join gives '.\\Imagenes\\<id>.png' on Windows and a valid
    # './Imagenes/<id>.png' elsewhere.
    target_dir = os.path.join('.', 'Imagenes')
    os.makedirs(target_dir, exist_ok=True)

    # Pair the values positionally so the frame's index labels do not matter.
    for url, tweet_id in zip(df[str(url_column)], df['id']):
        response = requests.get(url, timeout=timeout)
        if not response.ok:
            print('Skipping ' + str(url) + ': HTTP ' + str(response.status_code))
            continue

        path = os.path.join(target_dir, str(tweet_id) + '.png')
        # 'with' closes the file even if the write fails.
        with open(path, 'wb') as file:
            file.write(response.content)

In [269]:
# Download the image behind each row's picture_url.
download_images(df, url_column='picture_url')

## Save the dataframe to a csv and a pickle file

In [168]:
# Write df to a semicolon-separated CSV file.
# NOTE(review): the destination is an absolute, machine-specific path.
df.to_csv(
    path_or_buf=r'C:\Users\eddso\OneDrive\Maestría\Tareas Git Hub\tareas_data_science\statistical-learning-2\proyecto-final\twitter-scrapping\df.csv',
    sep=';',
)

In [265]:
# Load a pickled DataFrame into df_p.
# NOTE(review): no visible cell writes df.pickle (only df.csv is saved) -
# confirm where this file comes from.
# Only unpickle files you created yourself: loading a pickle can run arbitrary code.
df_p = pd.read_pickle(r'C:\Users\eddso\OneDrive\Maestría\Tareas Git Hub\tareas_data_science\statistical-learning-2\proyecto-final\twitter-scrapping\df.pickle')

In [267]:
# Show the (rows, columns) shape of the frame loaded from the pickle.
df_p.shape

(5578, 16)