# Dataset Cleaning

Dataset: https://research.signal-ai.com/datasets/signal1m.html

In [None]:
# Install the jsonlines package, which is imported further down in this cell to read the dataset file.
!pip install jsonlines

# Mount Google Drive at /content/drive; the dataset path used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')


import jsonlines
import pandas as pd  # If you're working with data in a DataFrame

# Path to the training split on the mounted Google Drive ("cris" in the original note).
file_path = '/content/drive/MyDrive/Advanced Information Retrieval - Trabajo/train.jsonl'

# Container for the parsed records, one entry per line of the JSONL file.
data = []

# Stream the file through the jsonlines reader and collect every parsed line.
with jsonlines.open(file_path, 'r') as reader:
    data.extend(reader)


Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0
Mounted at /content/drive


In [None]:

# Load the raw records into a DataFrame (one row per entry of `data`).
df = pd.DataFrame(data)

# Step 1: keep only the rows whose 'query' value occurs more than 5 times.
# transform('count') broadcasts each query's group size back onto its own rows,
# so the comparison below yields a row-aligned boolean mask.
query_frequency = df.groupby('query')['query'].transform('count')
is_frequent_query = query_frequency > 5
filtered_queries = df[is_frequent_query]

# Report how many rows remain, then display the reduced DataFrame.
print(len(filtered_queries))
filtered_queries

357736


Unnamed: 0,_id,title,text,query
51,638501434560876544,,September Starts With A Sizzle: We’re putting ...,when is september
52,638501434560876544,,September Starts With A Sizzle: We’re putting ...,when is the end of september
53,638501434560876544,,September Starts With A Sizzle: We’re putting ...,when does september start
111,638501569130950656,,West Nile virus found across DeKalb County:,what county is dekalb in
159,638501432094670848,,European efforts to stem migrant tide sow chao...,hungarian border crossing
...,...,...,...,...
8416242,640219441700147202,,#BREAKING: Police say missing 11-year-old has ...,how old is the missing girl
8416278,649061773820174340,,Sen. Bong Revilla admits that he wants to run ...,who is running for president
8416353,643673680829149184,,"@BBCGaryR @BBCNews I am a migrant, there is no...",is there a migrant crisis
8416363,645380013034835968,,@TyTheWeatherGuy @CStewartWPTV a win is a win ...,what is a win


In [None]:
# Step 2: build df_texts, holding one row per distinct document text.
# The first row seen for each text is kept, together with its '_id';
# the row position becomes the document id (index named 'id_document').
unique_documents = filtered_queries.drop_duplicates(subset='text')
df_texts = (
    unique_documents[['_id', 'text']]
    .reset_index(drop=True)
    .rename(columns={'text': 'docs'})
    .rename_axis('id_document')
)

# Step 3: build df_query, holding one row per distinct query in order of first appearance.
# The row position becomes the query id (index named 'id_query').
df_query = pd.DataFrame({'query': filtered_queries['query'].unique()}).rename_axis('id_query')

In [None]:
# Checking a suspected gap in the dataset: this document is not paired with the query "when is september".
suspect_text = "A hot start to September tomorrow! I'll show you how long the heat streak will last from tonight's CTV News at Six."
filtered_queries.loc[filtered_queries['text'].eq(suspect_text)]

Unnamed: 0,_id,title,text,query
1210,638502758006566912,,A hot start to September tomorrow! I'll show y...,what's the weather like in september


In [None]:
# Show every row that is paired with the query "when is september".
filtered_queries.loc[filtered_queries['query'].eq("when is september")]

Unnamed: 0,_id,title,text,query
51,638501434560876544,,September Starts With A Sizzle: We’re putting ...,when is september
13967,638519212491804672,,September is National Preparedness Month. 2015...,when is september
46417,638555941454000128,,Good morning! Happy September everyone! Have a...,when is september
52348,638564084586315776,,We made it. It's September.,when is september
54844,638567334148505600,,"September, then. Let's give September a try.",when is september
55792,638568397048020993,,RT @MRotellaWx: Happy September!,when is september
56271,638569352934072320,,September is here and so am I at the trav desk...,when is september
57358,638569963670769668,,Come September!! Gud morn,when is september
94647,638611766663073797,,#RipLesVacances #SeptemberIssue #augustisgone ...,when is september
100544,638618208367431681,,"Happy September 1st to all my sweet tweeters ,...",when is september


In [None]:
#Step 4 creating qrels, each query and the relevant documents
# qrels maps id_query (position of the query in df_query) to the ascending list of
# df_texts row labels whose '_id' appears next to that query in filtered_queries.
# NOTE(review): df_texts was de-duplicated on 'text', so an '_id' whose text duplicates an
# earlier row has no entry in df_texts and is left out here — confirm this is intended.

# Lookup table: document '_id' -> row label in df_texts.
doc_lookup = df_texts[['_id']].assign(doc_idx=df_texts.index)

# A single merge + groupby replaces re-scanning filtered_queries and df_texts once per query.
query_doc_pairs = (
    filtered_queries[['query', '_id']]
    .merge(doc_lookup, on='_id', how='inner')
    .drop_duplicates(subset=['query', 'doc_idx'])  # each document listed once per query
    .sort_values('doc_idx')                         # ascending row labels inside every group
)
docs_by_query = {
    query: doc_ids.tolist()  # .tolist() yields plain Python ints, which serialise cleanly later
    for query, doc_ids in query_doc_pairs.groupby('query', sort=False)['doc_idx']
}

# Queries without any matching document still get an (empty) entry.
qrels = {
    query_id: docs_by_query.get(query, [])
    for query_id, query in enumerate(df_query['query'])
}
qrels

{0: [0,
  357,
  1216,
  1373,
  1430,
  1448,
  1460,
  1488,
  2594,
  2784,
  3602,
  3840,
  3880,
  3894,
  4039,
  4058,
  4362,
  4434,
  4561,
  4568,
  4625,
  4797,
  5003,
  5485,
  5515,
  5518,
  5664,
  5757,
  5995,
  7106,
  7476,
  7525,
  9038,
  12130,
  23576,
  29110,
  31174,
  49737,
  62384,
  73104,
  102576,
  103077,
  121745,
  130666,
  145450,
  146944,
  172072,
  183603,
  184760,
  238661,
  248938],
 1: [0, 83973, 132031, 210321, 239534, 255446],
 2: [0, 5326, 7397, 57994, 103641, 181347],
 3: [1,
  106,
  217,
  629,
  1325,
  6634,
  9134,
  10795,
  17114,
  22173,
  83036,
  132877,
  151447,
  174505,
  208074,
  246439],
 4: [2, 46184, 129431, 131155, 139634],
 5: [3, 1648, 2799, 196041, 208937, 209001, 209018, 209057, 209072],
 6: [4,
  17474,
  68607,
  86592,
  140519,
  153298,
  199848,
  201097,
  210864,
  217170,
  243908],
 7: [5,
  9341,
  14069,
  50397,
  52814,
  53157,
  89283,
  106459,
  120981,
  174533,
  175484,
  175833,
  176

In [None]:
#Step5 saving the data to use it in the rest of the python files
import csv
# Define the CSV file path
csv_file_path = "/content/qrels.csv"

# Write the dictionary to the CSV file
with open(csv_file_path, 'w', newline='') as csvfile:
    fieldnames = ['Key', 'Values']
    csvwriter = csv.DictWriter(csvfile, fieldnames=fieldnames)

    # Write the header
    csvwriter.writeheader()

    # Write each row
    for key, values in qrels.items():
        csvwriter.writerow({'Key': key, 'Values': values})


# Save the DataFrame to the CSV file
df_query.to_csv("/content/queries.csv", index=True)
df_texts.to_csv("/content/docs.csv", index=True)

# Move the CSV files to Google Drive
!mv "/content/qrels.csv" "/content/drive/MyDrive/Advanced Information Retrieval - Trabajo/CLEAN DATASETS/qrels.csv"
!mv "/content/queries.csv" "/content/drive/MyDrive/Advanced Information Retrieval - Trabajo/CLEAN DATASETS/queries.csv"
!mv "/content/docs.csv" "/content/drive/MyDrive/Advanced Information Retrieval - Trabajo/CLEAN DATASETS/docs.csv"