In [1]:
import pandas as pd
import re

In [2]:
# Read the precomputed embeddings table from disk into a DataFrame.
embeddings_path = 'embeddings.csv'
embeddings_df = pd.read_csv(filepath_or_buffer=embeddings_path)

# Preview the first three rows.
embeddings_df.iloc[:3]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,linkedid,text,embedding
0,3984,3984,1689949000.0,- алло здравствуйте здравствуйте меня зовут ир...,[-1.02541447e-01 -1.69905424e-01 -5.45022339e-...
1,3985,3985,1689949000.0,- здравствуйте я бы мастера вызвать подвесной ...,[-7.03098401e-02 9.54969972e-02 1.09445930e-...
2,3986,3986,1689949000.0,- але\n - алло здравствуйте это сервисный цент...,[ 2.21290793e-02 -1.24347601e-02 1.06239542e-...


In [3]:
# Read the tab-separated linkedid/cat file. The first line of the file is
# skipped and the two columns are named explicitly.
linkedid_cat_path = 'linkedid_cat.txt'
linkedid_cat_df = pd.read_csv(
    linkedid_cat_path,
    sep='\t',
    names=['linkedid', 'cat'],
    skiprows=1,
)

# Preview the first three rows.
linkedid_cat_df.iloc[:3]

Unnamed: 0,linkedid,cat
0,1689907497.8471165,
1,1689907834.8471172,
2,1689907883.8471184,


In [4]:
# Work on the string form of 'linkedid' so the .str accessor applies to every row
# (missing values become the text 'nan').
linkedid_cat_df['linkedid'] = linkedid_cat_df['linkedid'].astype(str)
# Boolean mask of the rows whose 'linkedid' ends with the ".WAV" suffix
rows_with_wav = linkedid_cat_df['linkedid'].str.endswith('.WAV')
# On those rows keep everything except the four-character suffix; other rows are untouched
linkedid_cat_df['linkedid'] = linkedid_cat_df['linkedid'].mask(
    rows_with_wav,
    linkedid_cat_df['linkedid'].str[:-4],
)

In [5]:
# Both frames already expose the key as 'linkedid', so no rename is needed.
# Cast the key to float on both sides so the merge compares like with like.
# NOTE(review): equality joins on float keys can conflate ids that differ only
# beyond float64 precision - confirm this is acceptable for these ids.
for frame in (embeddings_df, linkedid_cat_df):
    frame['linkedid'] = frame['linkedid'].astype(float)

In [6]:
# Left join: keep every embeddings row and attach 'cat' wherever 'linkedid' matches
joined_df = embeddings_df.merge(linkedid_cat_df, how='left', on='linkedid')

In [7]:
# Discard the two leftover "Unnamed" columns that came in with embeddings.csv
joined_df = joined_df.drop(['Unnamed: 0.1', 'Unnamed: 0'], axis='columns')

In [8]:
# Rows without a matching category get 0 in 'cat'
joined_df = joined_df.assign(cat=joined_df['cat'].fillna(0))

In [9]:
# Preview the first three joined rows
joined_df.iloc[:3]

Unnamed: 0,linkedid,text,embedding,cat
0,1689949000.0,- алло здравствуйте здравствуйте меня зовут ир...,[-1.02541447e-01 -1.69905424e-01 -5.45022339e-...,0.0
1,1689949000.0,- здравствуйте я бы мастера вызвать подвесной ...,[-7.03098401e-02 9.54969972e-02 1.09445930e-...,0.0
2,1689949000.0,- але\n - алло здравствуйте это сервисный цент...,[ 2.21290793e-02 -1.24347601e-02 1.06239542e-...,0.0


In [10]:
# Label for each numeric 'cat' code; the position in the list is the code (0-5).
categories = dict(enumerate([
    '-',
    'Заявка',
    'Заявка не создана',
    'Повторный звонок по заявке',
    'Доп. услуги нашей компании',
    'Некоммерческая деятельность, ошибки',
]))

In [11]:
# Persist the joined table. The row index is written as the first (unnamed)
# column, which is pandas' default behaviour, spelled out here.
joined_df.to_csv('joined_df.csv', index=True)

In [12]:
# Keep only the rows whose 'cat' is 1 or 2
cat_is_1_or_2 = joined_df['cat'].isin([1, 2])
filtered_df = joined_df.loc[cat_is_1_or_2]
# Persist the subset (row index included), then preview the first three rows
filtered_df.to_csv('filtered_df.csv', index=True)
filtered_df.iloc[:3]

Unnamed: 0,linkedid,text,embedding,cat
13,1689949000.0,- здравствуйте вы знаете растирала машина все ...,[ 7.94982072e-03 -8.40310380e-02 2.50265837e-...,1.0
36,1689949000.0,- здравствуйте сервисный центр оператор елена ...,[-1.54398039e-01 -1.08174019e-01 -9.63503420e-...,1.0
62,1689949000.0,- алло\n - добрый вечер мастер по поводу плиты...,[ 5.40778376e-02 1.38826340e-01 3.35334651e-...,1.0


Category 1 is a bid (a service request was created).  
Category 2 is no bid (no request was created).

In [13]:
# Number of rows left after filtering
filtered_df.shape[0]

91

### Create embedding columns

In [14]:
# Parse the numbers out of the string representation of an embedding vector
def extract_values(embedding_str):
    """Return every number found in ``embedding_str`` as a list of floats.

    The input is the text form of a numeric vector, for example
    ``'[-1.02e-01  2.5  -3.]'``. Brackets, whitespace and any other
    separators are ignored; only the numeric tokens are converted.

    Args:
        embedding_str: String containing the numbers to extract.

    Returns:
        list[float]: The parsed values in the order they appear; an empty
        list when the string contains no numbers.
    """
    # One pattern covers every numeric spelling with its sign attached:
    # an optional sign, then either digits with an optional fractional part
    # ("-1", "-1.", "0.25") or a bare fraction (".5"), then an optional
    # exponent ("e-01", "E+3", "e5"). Matching the exponent as part of the
    # same token prevents it from being picked up as a separate number.
    number_pattern = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'
    return [float(token) for token in re.findall(number_pattern, embedding_str)]

# Parse each "embedding" string into a list of floats and build the wide table
# with a single DataFrame constructor call (one row per list). This avoids
# creating a separate Series object for every row, which is what
# `.apply(pd.Series)` does. The original index is reused so that the concat
# below lines the new columns up with the right rows.
embedding_values_df = pd.DataFrame(
    filtered_df['embedding'].map(extract_values).tolist(),
    index=filtered_df.index,
)

# Name the positional columns "emb_0", "emb_1", etc.
embedding_values_df.columns = [f'emb_{i}' for i in range(embedding_values_df.shape[1])]

# Replace the raw "embedding" string column with the expanded numeric columns
final_df = pd.concat([filtered_df.drop(columns=['embedding']), embedding_values_df], axis=1)

# Preview the first few rows of the result
final_df.head()

Unnamed: 0,linkedid,text,cat,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,...,emb_374,emb_375,emb_376,emb_377,emb_378,emb_379,emb_380,emb_381,emb_382,emb_383
13,1689949000.0,- здравствуйте вы знаете растирала машина все ...,1.0,0.00795,-0.084031,0.250266,0.028654,0.025751,0.016948,0.231806,...,-0.059023,0.078886,-0.044686,0.036361,0.010082,-0.059249,-0.174664,0.259075,-0.167563,-0.208279
36,1689949000.0,- здравствуйте сервисный центр оператор елена ...,1.0,-0.154398,-0.108174,-0.09635,-0.018527,-0.18809,0.05728,0.063462,...,0.067114,0.078061,0.081421,-0.048866,-0.057605,0.079759,-0.150723,0.013828,-0.062318,-0.025789
62,1689949000.0,- алло\n - добрый вечер мастер по поводу плиты...,1.0,0.054078,0.138826,0.033533,0.109747,-0.035089,0.015633,0.289364,...,-0.102859,0.104776,0.040318,-0.161395,-0.093331,0.007951,-0.020618,0.1315,-0.067704,-0.038313
69,1689949000.0,- оно\n - здравствуйте звоню вам из компании а...,1.0,-0.021159,0.135748,0.017591,-0.003244,-0.090894,-0.108294,0.235269,...,-0.037519,0.029865,0.242921,-0.046685,0.048737,-0.010472,0.013453,0.149711,-0.001337,-0.013312
91,1689949000.0,- юлия добрый день а вы знаете вот у меня ваша...,1.0,-0.032377,-0.075089,-0.04236,-0.001788,-0.103818,0.019969,0.079607,...,-0.019129,0.147747,0.011983,-0.119939,-0.10313,0.008909,-0.122116,0.07986,-0.120007,-0.164572
