In [97]:
import requests
import pandas as pd
import time
import random
import re
import numpy as np
import _pickle as pickle
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup as bs
from scipy.stats import halfnorm

In [153]:
from itertools import permutations

# Name of the single preference category; a later cell uses this list as the
# column names of the category frame.
categories = ["Style"]

# Lazily enumerate every ordered selection of 4 distinct labels out of the 18
# style labels (18 * 17 * 16 * 15 = 73,440 tuples).
perm = permutations(
    [
        "Classic", "Elegant", "Dramatic", "Feminine", "Sexy", "Masculine",
        "Romantic", "Casual", "Streetwear", "Glam", "Minimalist", "Vintage",
        "Boho", "Editorial", "Androgynous", "Edgy", "Preppy", "Maximalist",
    ],
    4,
)

# Render each 4-tuple as one comma-separated profile string.
styles = [", ".join(combo) for combo in perm]

# Shuffle in place so the profiles are not left in generation order, then wrap
# the strings in a single-column DataFrame.
random.shuffle(styles)
styles = pd.DataFrame({"Profiles": styles})

In [154]:
# Preview the shuffled profiles frame (pandas elides the middle rows).
styles

Unnamed: 0,Profiles
0,"Feminine, Boho, Preppy, Androgynous"
1,"Boho, Classic, Maximalist, Masculine"
2,"Masculine, Glam, Romantic, Sexy"
3,"Androgynous, Glam, Streetwear, Dramatic"
4,"Androgynous, Glam, Dramatic, Romantic"
...,...
73435,"Maximalist, Elegant, Boho, Dramatic"
73436,"Elegant, Editorial, Androgynous, Preppy"
73437,"Editorial, Casual, Edgy, Boho"
73438,"Streetwear, Dramatic, Classic, Masculine"


In [155]:
# Start from an empty frame that has one column per style category.
style_df = pd.DataFrame(columns=categories)
print(style_df)

# Populate every category column with one random integer code per profile row.
for column in style_df.columns:
    # randint's upper bound is exclusive, so the codes drawn here span 0..17.
    # NOTE(review): the earlier note on this cell described ranks running from
    # 1 (most preferred) to 18 (least preferred), which does not match this
    # 0-based range -- confirm which encoding is intended.
    style_df[column] = np.random.randint(0, 18, len(styles))

Empty DataFrame
Columns: [Style]
Index: []


In [156]:
# Preview the random integer codes generated for the Style column.
style_df

Unnamed: 0,Style
0,13
1,9
2,4
3,13
4,4
...,...
73435,15
73436,15
73437,8
73438,16


In [157]:
# Joining the two dataframes
final_df = styles.join(style_df)
final_df

Unnamed: 0,Profiles,Style
0,"Feminine, Boho, Preppy, Androgynous",13
1,"Boho, Classic, Maximalist, Masculine",9
2,"Masculine, Glam, Romantic, Sexy",4
3,"Androgynous, Glam, Streetwear, Dramatic",13
4,"Androgynous, Glam, Dramatic, Romantic",4
...,...,...
73435,"Maximalist, Elegant, Boho, Dramatic",15
73436,"Elegant, Editorial, Androgynous, Preppy",15
73437,"Editorial, Casual, Edgy, Boho",8
73438,"Streetwear, Dramatic, Classic, Masculine",16


In [158]:
# Persist the joined profiles/style frame to disk as a pickle.
with open("profiles.pkl", "wb") as pickle_out:
    pickle.dump(final_df, pickle_out)

In [159]:
# Read the pickled frame back in as `df`.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# were produced by a trusted source such as this notebook.
with open("profiles.pkl", "rb") as pickle_in:
    df = pickle.load(pickle_in)

In [160]:
# Also export the joined frame as CSV: header row written, index column omitted.
final_df.to_csv("profiles_dataframe.csv", header=True, index=False)

In [161]:
# Keep only the profile text, dropping the numeric Style codes.
# The explicit .copy() gives `df` its own data: without it, the column
# assignments made to `df` in later cells operate on a selection of another
# frame and raise SettingWithCopyWarning.
df = df[['Profiles']].copy()

In [162]:
# NOTE(review): default_fill_value is not referenced in any visible cell; this
# looks like a stray auto-import and is a candidate for removal.
from numpy.ma.core import default_fill_value

# Sampling weights per categorical column. Keys must match the column names
# listed in `names` ("Style"), because the assignment loop looks weights up by
# column name.
p = {}
style_types = ["Classic", "Elegant", "Dramatic", "Feminine", "Sexy", "Masculine",
               "Romantic", "Casual", "Streetwear", "Glam", "Minimalist", "Vintage",
               "Boho", "Editorial", "Androgynous", "Edgy", "Preppy", "Maximalist"]

# One weight per entry of style_types, in the same order; the weights sum to 1.
p['Style'] = [0.11, 0.10, 0.07, 0.13, 0.04, 0.06,
              0.06, 0.04, 0.01, 0.01, 0.10, 0.03,
              0.03, 0.07, 0.02, 0.04, 0.01, 0.07]

# Age: one half-normal draw per row (loc=18, scale=8), truncated to an integer,
# so no generated age is below 18.
age = halfnorm.rvs(loc=18, scale=8, size=df.shape[0]).astype(int)

# Gender: one uniformly chosen label per row, stored as a single-column frame.
gender_labels = ["Male", "Female", "Non-Binary"]
gender = pd.DataFrame(
    random.choices(gender_labels, k=df.shape[0]),
    columns=["Gender"],
)

In [163]:
# Inspect df, which holds only the Profiles column at this point.
df

Unnamed: 0,Profiles
0,"Feminine, Boho, Preppy, Androgynous"
1,"Boho, Classic, Maximalist, Masculine"
2,"Masculine, Glam, Romantic, Sexy"
3,"Androgynous, Glam, Streetwear, Dramatic"
4,"Androgynous, Glam, Dramatic, Romantic"
...,...
73435,"Maximalist, Elegant, Boho, Dramatic"
73436,"Elegant, Editorial, Androgynous, Preppy"
73437,"Editorial, Casual, Edgy, Boho"
73438,"Streetwear, Dramatic, Classic, Masculine"


In [164]:
# Column names for the refined frame and, in the same order, the data behind
# each one: the list of style labels to sample from, the generated ages, and
# the generated gender frame.
names = ["Style", "Age", "Gender"]
final_categories = [style_types, age, gender]

# Pair every column name with its data source.
combined = {label: source for label, source in zip(names, final_categories)}

### Make random values for every category

In [165]:
# Fill one column of df per entry of `combined`.
# The loop variable is called `values` (not `categories`) so that the
# module-level `categories` list defined in an earlier cell is not overwritten.
for name, values in combined.items():
    if name in ('Age', 'Gender'):
        # Age and Gender already hold one pre-generated value per row.
        df[name] = values
        continue

    # Optional sampling weights: try the exact column name first, then its
    # lowercase form. None makes np.random.choice sample uniformly, so a
    # category without weights still works, while malformed weights now raise
    # instead of being silently ignored.
    weights = p.get(name, p.get(name.lower()))

    # Three draws per row, with replacement, so a row may repeat a label.
    draws = np.random.choice(values, size=(df.shape[0], 3), p=weights)

    # Drop repeated labels within each row while keeping first-drawn order, so
    # each row ends up with 1-3 distinct labels and the result does not depend
    # on set iteration order.
    df[name] = pd.Series(
        [list(dict.fromkeys(row)) for row in draws.tolist()],
        index=df.index,
    )
        

In [166]:
# Inspect df now that the Style, Age and Gender columns have been assigned.
df

Unnamed: 0,Profiles,Style,Age,Gender
0,"Feminine, Boho, Preppy, Androgynous","[Edgy, Masculine]",18,Female
1,"Boho, Classic, Maximalist, Masculine","[Boho, Romantic]",20,Non-Binary
2,"Masculine, Glam, Romantic, Sexy","[Streetwear, Feminine, Masculine]",22,Female
3,"Androgynous, Glam, Streetwear, Dramatic","[Casual, Romantic, Editorial]",21,Male
4,"Androgynous, Glam, Dramatic, Romantic","[Preppy, Masculine]",29,Non-Binary
...,...,...,...,...
73435,"Maximalist, Elegant, Boho, Dramatic","[Classic, Editorial]",23,Male
73436,"Elegant, Editorial, Androgynous, Preppy","[Preppy, Feminine, Elegant]",20,Female
73437,"Editorial, Casual, Edgy, Boho","[Androgynous, Feminine, Minimalist]",23,Male
73438,"Streetwear, Dramatic, Classic, Masculine","[Boho, Maximalist, Androgynous]",20,Male


In [167]:
# Persist the refined frame (profiles plus the generated columns) as a pickle.
with open("refined_profiles.pkl", "wb") as pickle_out:
    pickle.dump(df, pickle_out)