In [1]:
import weaviate 
from weaviate.connect import ConnectionParams
from weaviate.classes.config import Configure
import json
from weaviate.classes.config import Property, DataType
from weaviate.classes.init import AdditionalConfig, Timeout
from weaviate.classes.query import Filter
import pandas as pd
import os
from dotenv import load_dotenv



In [2]:
# Load the newline-delimited JSON review dump and show its raw structure.
df = pd.read_json("video_game_reviews.json", lines=True)
df.info()

# Normalise column names into identifier-style names:
#   1. spaces become underscores,
#   2. the "(Hours)" unit suffix is dropped -- matched case-insensitively,
#      because the source column is spelled "Game Length (Hours)" and a
#      case-sensitive "\(hours\)" pattern never matches it,
#   3. leading/trailing underscores left behind by the steps above are trimmed.
df.columns = (
    df.columns
    .str.replace(" ", "_")
    .str.replace(r"\(hours\)", "", case=False, regex=True)
    .str.strip("_")
)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47774 entries, 0 to 47773
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Game Title               47774 non-null  object 
 1   User Rating              47774 non-null  float64
 2   Age Group Targeted       47774 non-null  object 
 3   Price                    47774 non-null  float64
 4   Platform                 47774 non-null  object 
 5   Requires Special Device  47774 non-null  object 
 6   Developer                47774 non-null  object 
 7   Publisher                47774 non-null  object 
 8   Release Year             47774 non-null  int64  
 9   Genre                    47774 non-null  object 
 10  Multiplayer              47774 non-null  object 
 11  Game Length (Hours)      47774 non-null  float64
 12  Graphics Quality         47774 non-null  object 
 13  Soundtrack Quality       47774 non-null  object 
 14  Story Quality         

In [4]:
# Rename specific columns in the DataFrame.
# (No-op if the unit suffix has already been stripped from the column name.)
df = df.rename(columns={"Game_Length_(Hours)": "Game_Length"})

# Generate one schema property per DataFrame column, mapping the pandas dtype
# to a schema data type name.
properties = []
for col in df.columns:
    column = df[col]
    # Order matters: bool must be tested first because pandas reports bool
    # columns as numeric, and integers must be tested before the generic
    # numeric check so they are not widened to floating point "number".
    if pd.api.types.is_bool_dtype(column):
        data_type = "boolean"
    elif pd.api.types.is_integer_dtype(column):
        data_type = "int"
    elif pd.api.types.is_numeric_dtype(column):
        data_type = "number"
    else:
        data_type = "text"

    properties.append({
        "name": col,
        "dataType": [data_type],
        "description": f"The {col} of the video game."
    })

# Assemble the schema document: a single class holding every column as a property.
weaviate_properties = properties
schema = {
    "classes": [
        {
            "class": "VideoGamex",
            "description": "A collection of video game data including reviews and ratings.",
            "properties": weaviate_properties
        }
    ]
}

# Print schema as JSON for visual inspection
print(json.dumps(schema, indent=2))

{
  "classes": [
    {
      "class": "VideoGamex",
      "description": "A collection of video game data including reviews and ratings.",
      "properties": [
        {
          "name": "Game_Title",
          "dataType": [
            "text"
          ],
          "description": "The Game_Title of the video game."
        },
        {
          "name": "User_Rating",
          "dataType": [
            "number"
          ],
          "description": "The User_Rating of the video game."
        },
        {
          "name": "Age_Group_Targeted",
          "dataType": [
            "text"
          ],
          "description": "The Age_Group_Targeted of the video game."
        },
        {
          "name": "Price",
          "dataType": [
            "number"
          ],
          "description": "The Price of the video game."
        },
        {
          "name": "Platform",
          "dataType": [
            "text"
          ],
          "description": "The Platform of the video

In [5]:
# Load connection settings from a local .env file into the process environment.
load_dotenv()

# Retrieve the variables
wcd_url = os.getenv("WCD_URL")
wcd_api_key = os.getenv("WCD_API_KEY")

# Fail early with a clear message instead of handing None to the client,
# which would surface as a much less obvious connection/auth error.
missing_settings = [
    name
    for name, value in (("WCD_URL", wcd_url), ("WCD_API_KEY", wcd_api_key))
    if not value
]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_settings)
        + ". Define them in the environment or in a .env file."
    )

# Connect to your Weaviate instance
# NOTE(review): this is the deprecated v3 client API (see the warning emitted
# by this cell); the later batch-import cell relies on the same v3 interface,
# so both need to be migrated together.
client = weaviate.Client(
    url=wcd_url,
    auth_client_secret=weaviate.AuthApiKey(wcd_api_key)
)

# Register the schema
# NOTE(review): presumably this raises if the class already exists, making the
# cell non-re-runnable against the same instance -- confirm and guard if needed.
client.schema.create(schema)

Python client v3 `weaviate.Client(...)` connections and methods are deprecated and will
            be removed by 2024-11-30.

            Upgrade your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.
                - For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
                - For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration

            If you have to use v3 code, install the v3 client and pin the v3 dependency in your requirements file: `weaviate-client>=3.26.7;<4.0.0`
  client = weaviate.Client(


In [6]:
# Import into the class that was actually registered in the schema, rather
# than a separately hard-coded name that can drift out of sync with it.
target_class = schema["classes"][0]["class"]

# One import object per DataFrame row: the target class plus a
# {column name: value} mapping of that row's properties.
objects = [
    {"class": target_class, "properties": record}
    for record in df.to_dict(orient="records")
]

In [7]:
# Stream every prepared object to the server through the client's batch context.
with client.batch as batch:
    batch.batch_size = 100  # Set your batch size (e.g., 100 objects per batch)
    for obj in objects:
        batch.add_data_object(obj["properties"], obj["class"])

# Report completion only after the `with` block has exited: the batch context
# manager is expected to send any remaining queued objects on exit (per the
# Weaviate v3 batching docs -- confirm), so printing inside the block would
# claim success before the final partial batch had been sent.
print("Data ingestion complete.")

Data ingestion complete.
