## Inserting *CSVs* into MongoDB for the first time

In [2]:
import os
import glob
import pandas as pd
from pymongo import MongoClient

# Function to map the structure of a given DataFrame to the common schema
def MapToCommonSchema(DF):
    """Project a scraped-posts DataFrame onto the common post schema.

    Every schema field that exists as a column of ``DF`` is copied over; fields
    without a matching column are filled with ``None``. When ``DF`` has no
    ``comments`` column, whichever of ``comment1``/``comment2``/``comment3`` are
    present are merged row-wise into a list, skipping missing values.

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame whose columns may include any subset of the schema fields.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``DF`` holding exactly the schema columns, in schema
        order.
    """
    schema_fields = [
        'type', 'postTitle', 'postDesc', 'postTime', 'authorName', 'noOfUpvotes',
        'isNSFW', 'comments', 'noOfComments', 'imageUrl', 'postUrl', 'subReddit',
    ]
    numbered_comment_fields = ['comment1', 'comment2', 'comment3']

    # Dict insertion order fixes the column order of the resulting frame.
    CommonSchema = {
        field: DF[field] if field in DF.columns else None
        for field in schema_fields
    }

    if 'comments' not in DF.columns:
        # Use whichever numbered comment columns exist rather than requiring all
        # three: Reading() accepts a file that only has 'comment1', and its
        # comments would otherwise be dropped silently.
        present = [name for name in numbered_comment_fields if name in DF.columns]
        if present:
            CommonSchema['comments'] = DF[present].apply(
                lambda row: [x for x in row if pd.notna(x)], axis=1
            )

    return pd.DataFrame(CommonSchema)

# Function to read and process different CSV formats based on the columns present in the CSV
def Reading(file_path):
    """Load one CSV file and convert it to the common schema.

    A file is accepted when it has a ``postTitle`` column together with either
    a ``comment1`` or a ``comments`` column; it is then passed through
    ``MapToCommonSchema``.

    Raises
    ------
    ValueError
        If the file's columns match neither accepted layout.
    """
    frame = pd.read_csv(file_path)
    columns = frame.columns

    has_comment_data = 'comment1' in columns or 'comments' in columns
    if not ('postTitle' in columns and has_comment_data):
        raise ValueError(f"Unknown CSV format for: {file_path}")

    return MapToCommonSchema(frame)

# Connect to MongoDB
def MongoDBConnection():
    """Return a handle to the ``Data`` collection of the ``DataTails`` database.

    A new ``MongoClient`` for ``mongodb://localhost:27017/`` is created on
    every call.
    """
    return MongoClient('mongodb://localhost:27017/')['DataTails']['Data']

# Directory containing the CSV files
Dir = './data/'
Files = glob.glob(os.path.join(Dir, '*.csv'))
FinalDF = []
collection = MongoDBConnection()

# Convert every CSV with a recognised layout; Reading() raises ValueError for
# the others, which are reported and skipped.
for File in Files:
    try:
        ProcessedDF = Reading(File)
        FinalDF.append(ProcessedDF)
    except ValueError as e:
        print(e)

# pd.concat raises ValueError on an empty list, so only combine when at least
# one file was converted.
if FinalDF:
    CombinedDF = pd.concat(FinalDF, ignore_index=True)
    records = CombinedDF.to_dict(orient='records')
else:
    CombinedDF = pd.DataFrame()
    records = []

# insert_many rejects an empty document list, so skip the call when there is
# nothing to store.
if records:
    collection.insert_many(records)
    print("Data processed and saved to MongoDB in the 'DataTails.Data' collection.")
else:
    print(f"No usable CSV rows found in {Dir}; nothing was inserted.")



Unknown CSV format for: ./data\Subreddits.csv
Data processed and saved to MongoDB in the 'reddit_data.posts' collection.


## Checking for and removing *duplicates* in MongoDB

In [4]:
from pymongo import MongoClient

# Connect to MongoDB
client = MongoClient('mongodb://localhost:27017/')
db = client['DataTails']
collection = db['Data']

# Stage 1: bucket documents that share postUrl, postTime and noOfComments,
# collecting the _id of every member and counting them.
group_stage = {
    "$group": {
        "_id": {
            "postUrl": "$postUrl",
            "postTime": "$postTime",
            "noOfComments": "$noOfComments",
        },
        "duplicateIds": {"$push": "$_id"},
        "count": {"$sum": 1},
    }
}

# Stage 2: keep only the buckets that hold more than one document.
match_stage = {"$match": {"count": {"$gt": 1}}}

pipeline = [group_stage, match_stage]
duplicates = collection.aggregate(pipeline)

# For each bucket the first collected _id is kept; every other one is deleted.
for group in duplicates:
    duplicate_ids = group['duplicateIds']
    surplus_ids = duplicate_ids[1:]
    for doc_id in surplus_ids:
        collection.delete_one({'_id': doc_id})

print("Duplicate documents removed.")



Duplicate documents removed.


## Checking for duplicates *before inserting* in MongoDB

In [5]:
import os
import glob
import pandas as pd
from pymongo import MongoClient

# Function to map the structure of a given DataFrame to the common schema
def MapToCommonSchema(DF):
    """Project a scraped-posts DataFrame onto the common post schema.

    Every schema field that exists as a column of ``DF`` is copied over; fields
    without a matching column are filled with ``None``. When ``DF`` has no
    ``comments`` column, whichever of ``comment1``/``comment2``/``comment3`` are
    present are merged row-wise into a list, skipping missing values.

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame whose columns may include any subset of the schema fields.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``DF`` holding exactly the schema columns, in schema
        order.
    """
    schema_fields = [
        'type', 'postTitle', 'postDesc', 'postTime', 'authorName', 'noOfUpvotes',
        'isNSFW', 'comments', 'noOfComments', 'imageUrl', 'postUrl', 'subReddit',
    ]
    numbered_comment_fields = ['comment1', 'comment2', 'comment3']

    # Dict insertion order fixes the column order of the resulting frame.
    CommonSchema = {
        field: DF[field] if field in DF.columns else None
        for field in schema_fields
    }

    if 'comments' not in DF.columns:
        # Use whichever numbered comment columns exist rather than requiring all
        # three: Reading() accepts a file that only has 'comment1', and its
        # comments would otherwise be dropped silently.
        present = [name for name in numbered_comment_fields if name in DF.columns]
        if present:
            CommonSchema['comments'] = DF[present].apply(
                lambda row: [x for x in row if pd.notna(x)], axis=1
            )

    return pd.DataFrame(CommonSchema)

# Function to read and process different CSV formats based on the columns present in the CSV
def Reading(file_path):
    """Load one CSV file and convert it to the common schema.

    A file is accepted when it has a ``postTitle`` column together with either
    a ``comment1`` or a ``comments`` column; it is then passed through
    ``MapToCommonSchema``.

    Raises
    ------
    ValueError
        If the file's columns match neither accepted layout.
    """
    frame = pd.read_csv(file_path)
    columns = frame.columns

    has_comment_data = 'comment1' in columns or 'comments' in columns
    if not ('postTitle' in columns and has_comment_data):
        raise ValueError(f"Unknown CSV format for: {file_path}")

    return MapToCommonSchema(frame)

# Connect to MongoDB
def MongoDBConnection():
    """Return a handle to the ``Data`` collection of the ``DataTails`` database.

    A new ``MongoClient`` for ``mongodb://localhost:27017/`` is created on
    every call.
    """
    return MongoClient('mongodb://localhost:27017/')['DataTails']['Data']

# Function to insert records into MongoDB while avoiding duplicates
def CheckingDuplicates(collection, records):
    """Insert ``records`` one at a time, skipping those already stored.

    A record counts as already stored when the collection holds at least one
    document with the same ``postUrl``, ``postTime`` and ``noOfComments``.
    Skipped records are reported with ``print``.

    Parameters
    ----------
    collection
        Object providing ``count_documents(query, limit=...)`` and
        ``insert_one(document)``.
    records : iterable of dict
        Documents to insert; each must contain the three key fields.
    """
    key_fields = ('postUrl', 'postTime', 'noOfComments')

    for record in records:
        query = {field: record[field] for field in key_fields}

        # limit=1 lets the count stop at the first match; only existence matters.
        already_stored = collection.count_documents(query, limit=1) != 0
        if already_stored:
            print(f"Duplicate record found and skipped: {record['postUrl']} at {record['postTime']} with {record['noOfComments']} comments")
            continue

        collection.insert_one(record)

# Directory containing the CSV files
Dir = './data/shitba'
Files = glob.glob(os.path.join(Dir, '*.csv'))
FinalDF = []
collection = MongoDBConnection()

# Convert every CSV with a recognised layout; Reading() raises ValueError for
# the others, which are reported and skipped.
for File in Files:
    try:
        ProcessedDF = Reading(File)
        FinalDF.append(ProcessedDF)
    except ValueError as e:
        print(e)

# pd.concat raises ValueError on an empty list, so only combine and insert when
# at least one file was converted.
if FinalDF:
    CombinedDF = pd.concat(FinalDF, ignore_index=True)
    records = CombinedDF.to_dict(orient='records')

    # Insert records into MongoDB, skipping duplicates based on postUrl, postTime, and noOfComments
    CheckingDuplicates(collection, records)
    print("Data processed and saved to MongoDB in the 'DataTails.Data' collection.")
else:
    CombinedDF = pd.DataFrame()
    records = []
    print(f"No usable CSV files found in {Dir}; nothing was inserted.")


Duplicate record found and skipped: https://www.reddit.com/r/askscience/comments/853m2f/why_do_nuclear_power_plants_have_those_distinct/ at 2018-03-17 18:07:20 with 608 comments
Duplicate record found and skipped: https://www.reddit.com/r/Music/comments/7ekwu9/the_fcc_is_about_to_kill_net_neutrality_were/ at 2017-11-22 01:29:11 with 1742 comments
Duplicate record found and skipped: https://www.reddit.com/r/Music/comments/v424xd/confederate_flags_are_banned_from_cma_country/ at 2022-06-03 20:40:36 with 9185 comments
Duplicate record found and skipped: https://www.reddit.com/r/Music/comments/hbhk01/rick_astley_everlong_foo_fighters_cover_rock/ at 2020-06-18 21:08:49 with 3356 comments
Duplicate record found and skipped: https://www.reddit.com/r/Music/comments/77fow0/the_fcc_is_expected_to_announce_a_vote_to_gut_net/ at 2017-10-19 21:50:50 with 4210 comments
Duplicate record found and skipped: https://www.reddit.com/r/Music/comments/h0l06d/rightwing_fans_mocked_for_boycotting_rage_against

In [8]:
import pandas as pd

# NOTE(review): absolute, machine-specific path; this cell only runs where that file exists.
islam_top_path = "/Users/fasihrem/Downloads/University/Final Year Project/data-tails/Backend/data/macbook/islam_top.csv"
df = pd.read_csv(islam_top_path)

# (rows, columns) of the loaded frame, shown as the cell output.
df.shape

(996, 13)

In [9]:
# Preview the first rows of the frame loaded in the previous cell; as the last
# expression of the cell it is rendered as the cell output.
df.head()

Unnamed: 0.1,Unnamed: 0,type,subReddit,postTitle,postDesc,postTime,authorName,noOfUpvotes,isNSFW,comments,noOfComments,imageUrl,postUrl
0,0,top,islam,Orthodox Jewish man called Andrew who has been...,,2020-07-10 03:37:37,gunnerpuner,8233,False,"['[deleted]', ""Oh, that's both courageous and ...",376,https://i.redd.it/wmr44hbssw951.jpg,https://www.reddit.com/r/islam/comments/hod0bd...
1,1,top,islam,Israeli forces attacked peaceful worshippers h...,,2021-05-08 14:33:51,wyazici,7542,False,['Anyone encouraging brigading of other subs a...,813,https://v.redd.it/wcnoctm29vx61,https://www.reddit.com/r/islam/comments/n7llqj...
2,2,top,islam,A video uploaded and later deleted by a Han Ch...,,2020-08-31 20:19:14,KaraSoy,6992,True,['I can’t watch. I stopped after 24 seconds. I...,1381,https://v.redd.it/ym4pkkttuck51,https://www.reddit.com/r/islam/comments/ijzvzo...
3,3,top,islam,Orthodox Judaism rejects Zionism.,,2021-05-12 16:05:52,hillenium,6247,False,"[""We have to remember that not all Jews are Zi...",304,https://i.redd.it/d1dbuwp79oy61.jpg,https://www.reddit.com/r/islam/comments/nam1ja...
4,4,top,islam,Islamophilia,,2021-04-22 07:57:03,,5904,False,"['Subhanallah', 'That\'s brilliant. Having bee...",107,https://i.redd.it/q57h23yr3nu61.jpg,https://www.reddit.com/r/islam/comments/mvvu22...


In [2]:
from datetime import datetime

# Snapshot the current local date and time.
today = datetime.now()

# Full weekday name, zero-padded day of month and abbreviated month name,
# lower-cased (e.g. "tuesday 26nov").
formatted_date = f"{today:%A %d%b}".lower()

print(formatted_date)

tuesday 26nov
