In [1]:
# Importing required libraries

import ast
import os

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from pymongo import MongoClient

In [2]:
# Read the raw CSV export into the working DataFrame
df = pd.read_csv(filepath_or_buffer="chatgpt1.csv")

In [3]:
# To extract value from Source field by eliminating html tags

def extract_source(html_tag):
    """Return the first child of the first ``<a>`` element found in ``html_tag``.

    Parameters
    ----------
    html_tag : str
        Markup to search for an anchor element.

    Returns
    -------
    The first item of the anchor's ``contents``. When ``html_tag`` is not a
    string (for example a missing value), or contains no non-empty ``<a>``
    element, the input is returned unchanged, so applying the function to an
    already-cleaned value is a no-op rather than an AttributeError.
    """
    # Missing cells are not strings; there is nothing to parse.
    if not isinstance(html_tag, str):
        return html_tag
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning about the guess.
    tag_info = BeautifulSoup(html_tag, "html.parser")
    tag = tag_info.find('a')
    # find() returns None when there is no anchor (e.g. plain text input).
    if tag is None or not tag.contents:
        return html_tag
    return tag.contents[0]

# Replace each Source cell with the value extract_source derives from it
df['Source'] = df['Source'].map(extract_source)

In [4]:
# Convert the Datetime column from text to pandas datetimes.
# Everything from the first "+" onwards (presumably a UTC offset suffix) is
# dropped before parsing. The vectorised .str accessor replaces the per-row
# lambda and lets missing values flow through to NaT instead of raising
# AttributeError on a float.
df['Datetime'] = pd.to_datetime(df['Datetime'].str.split("+").str[0])

In [5]:
# Lower-case the text of each hashtag cell and parse it into a Python object
# (the cells are expected to hold the text of a list literal).
# NOTE(review): this used eval(), which would execute arbitrary expressions
# found in the CSV; ast.literal_eval only accepts literals and raises
# ValueError/SyntaxError on anything else.
df['hashtag'] = df['hashtag'].apply(lambda x: ast.literal_eval(x.lower()))

In [6]:
# Extract list of mentioned users from the given user object list string

def create_users_list(user_object):
    """Extract the user names from the text of a list of user objects.

    For every occurrence of ``username=`` in ``user_object`` the text up to
    the next ``", "`` is taken and its first and last characters (presumably
    the surrounding quotes) are stripped.

    Parameters
    ----------
    user_object : str, list, float or None
        Text to scan. ``None`` and NaN denote a missing value. A list is
        treated as an already-extracted result.

    Returns
    -------
    list
        The extracted names in order of appearance; ``[]`` for a missing
        value or when no ``username=`` occurs.
    """
    # Already parsed (e.g. the cell was re-run): hand the list back untouched.
    if isinstance(user_object, list):
        return user_object
    # Test for NaN by value: an identity check against np.nan misses NaN
    # objects that are not that exact singleton (float('nan'), computed NaNs).
    if user_object is None or (isinstance(user_object, float) and np.isnan(user_object)):
        return []
    return [
        chunk.split(", ")[0][1:-1]
        for chunk in user_object.split("username=")[1:]
    ]

# Swap each MentionedUsers cell for the list produced by create_users_list
df['MentionedUsers'] = df['MentionedUsers'].map(create_users_list)

## Document Structure

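Each document describes one user (`Username`, `UserURL`) and embeds a `tweets` array holding one sub-document per tweet by that user.

<img src="https://file.notion.so/f/s/6636567c-eb71-4551-998a-36f25f4ddec3/Untitled.png?id=46dcf59e-ee26-4a1d-a66e-d5c3ef1cc3e2&table=block&spaceId=001cacfe-de91-4d06-ad67-71621a69c7f9&expirationTimestamp=1682028873140&signature=vzPtzlpdGGMKeRUNay4cy5EgfAUtuMnAxhI8-tYw7aw&downloadName=Untitled.png" alt="Document structure: one user document with an embedded list of tweets">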

In [7]:
# Preview the first five cleaned rows
df.head(n=5)

Unnamed: 0,Datetime,Tweet Id,Text,Username,Permalink,User,Outlinks,CountLinks,ReplyCount,RetweetCount,LikeCount,QuoteCount,ConversationId,Language,Source,Media,QuotedTweet,MentionedUsers,hashtag,hastag_counts
0,2023-01-22 13:44:34,1617156270871699456,ChatGPTで遊ぶの忘れてた！！\n書類作るコード書いてみてほしいのと、\nどこまで思考整...,mochico0123,https://twitter.com/mochico0123/status/1617156...,https://twitter.com/mochico0123,,,1,0,5,0,1617156270871699456,ja,Twitter for iPhone,,,[],[],0
1,2023-01-22 13:44:39,1617156291046133761,@AlexandrovnaIng Prohibition of ChatGPT has be...,Caput_LupinumSG,https://twitter.com/Caput_LupinumSG/status/161...,https://twitter.com/Caput_LupinumSG,,,1,0,5,0,1617148639993806848,en,Twitter for iPhone,,,[AlexandrovnaIng],[],0
2,2023-01-22 13:44:44,1617156308926349312,"Schaut Euch an, was @fobizz @DianaKnodel alles...",ciffi,https://twitter.com/ciffi/status/1617156308926...,https://twitter.com/ciffi,['https://us02web.zoom.us/webinar/register/801...,"['https://t.co/DsoeVJrPBp', 'https://t.co/HflT...",0,0,4,0,1617156308926349312,de,Twitter for iPad,[Photo(previewUrl='https://pbs.twimg.com/media...,https://twitter.com/DianaKnodel/status/1617153...,"[fobizz, DianaKnodel, reg_schulz, EskenSaskia]",[#chatgpt],1
3,2023-01-22 13:44:49,1617156332297256961,Bow down to chatGPT 🫡..... https://t.co/ENTSzi...,Vishwasrisiri,https://twitter.com/Vishwasrisiri/status/16171...,https://twitter.com/Vishwasrisiri,['https://twitter.com/agadmator/status/1617155...,['https://t.co/ENTSzi2AQ9'],0,0,2,0,1617156332297256961,en,Twitter for Android,,https://twitter.com/agadmator/status/161715501...,[],[],0
4,2023-01-22 13:44:52,1617156345064570880,"Profilinde vatan, Türkiye falan yazan bireyler...",0xGenetikciniz,https://twitter.com/0xGenetikciniz/status/1617...,https://twitter.com/0xGenetikciniz,,,0,0,4,0,1617156345064570880,tr,Twitter for iPhone,,,[],[],0


In [8]:
# Group the tweets by author: `out` maps each Username to one document that
# carries the user's fields plus a list of that user's tweets.

def _parse_list_cell(raw):
    """Turn a list-valued CSV cell into a real list.

    A float (how an empty cell is read back, i.e. NaN) becomes ``[]``; any
    other value is expected to be the text of a Python list literal.
    """
    if isinstance(raw, float):
        return []
    # NOTE(review): this replaces eval(); ast.literal_eval only accepts
    # literals, so the CSV content cannot execute code.
    return ast.literal_eval(raw)


def _tweet_document(row):
    """Build the embedded tweet sub-document for one ``itertuples`` row."""
    return {
        'Datetime': row.Datetime,
        # `_2` is the positional name itertuples assigns to a column whose
        # header is not a valid identifier - presumably "Tweet Id"; verify
        # if the column order of the CSV ever changes.
        'tweet_id': row._2,
        'Text': row.Text,
        'Permalink': row.Permalink,
        'Outlinks': _parse_list_cell(row.Outlinks),
        'Countlinks': _parse_list_cell(row.CountLinks),
        'ReplyCount': row.ReplyCount,
        'RetweetCount': row.RetweetCount,
        'LikeCount': row.LikeCount,
        'QuoteCount': row.QuoteCount,
        'ConversationId': row.ConversationId,
        'Language': row.Language,
        'Source': row.Source,
        'Media': row.Media,
        'QuotedTweet': row.QuotedTweet,
        'MentionedUsers': row.MentionedUsers,
        'hashtags': row.hashtag,
        'hashtagCount': row.hastag_counts,
    }


out = {}
for row in df.itertuples():
    user_doc = out.get(row.Username)
    if user_doc is None:
        # First tweet seen for this user: create the user-level document.
        user_doc = {"Username": row.Username, "UserURL": row.User, "tweets": []}
        out[row.Username] = user_doc
    user_doc['tweets'].append(_tweet_document(row))


In [9]:
# Verification: user 'translation_ja' must end up with exactly 60 tweets.
# The assert stops a Run All on a mismatch instead of relying on someone
# reading the number; the count is still shown as the cell output.
assert len(out['translation_ja']['tweets']) == 60, "expected 60 tweets for 'translation_ja'"
len(out['translation_ja']['tweets'])

60

In [11]:
# Flatten the per-user mapping into a plain list of documents
full_doc = list(out.values())

In [12]:
# Push the per-user documents to the MongoDB server.

# Take the connection string from the MONGO_SERVER_URL environment variable so
# it never has to be pasted into the notebook; when the variable is unset the
# original placeholder value is used, exactly as before.
URL = os.environ.get("MONGO_SERVER_URL", "MONGO_SERVER_URL")
client = MongoClient(URL)
db = client.chatgpt
collection = db.user_tweets

# MongoClient connects lazily, so round-trip a ping before reporting success;
# an unreachable server then raises here rather than after the message.
client.admin.command("ping")
print("Connection to MongoDB Server Established ..!\n")

# Insert all documents with a single insert_many call and report how many
# ids the server returned.
result = collection.insert_many(full_doc)
answer = len(result.inserted_ids)
print(f"Inserted {answer} documents into the collection")

Connection to MongoDB Server Established ..!

Inserted 38433 documents into the collection
