In [1]:
import re

import pandas as pd 
from langchain_community.vectorstores import FAISS


In [2]:
from neo4j import GraphDatabase
driver = GraphDatabase.driver("bolt://192.168.0.131:7687", auth=("cc", "Tt66668888."))
def execute_cypher(session, cypher_statement):
    session.run(cypher_statement)



In [3]:
import re
def sanitize_name(name):
    sanitized = re.sub(r'\W+', '_', name)
    if re.match(r'^\d', sanitized):
        sanitized = 'N_' + sanitized
    return sanitized

def create_cypher(file_name, sheet_name, df, tertiary_node_col):
    # 创建一级节点（文件名）
    cypher_file = f"MERGE (f:SourceFile {{name: '{sanitize_name(file_name)}'}})"
    
    # 创建二级节点（表格名称）并与一级节点关联
    cypher_sheet = f"""
    MERGE (s:SourceSheet {{name: '{sanitize_name(sheet_name)}'}})
    MERGE (s)-[:BELONGS_TO]->(f)
    """
    
    # 创建三级节点（定义的列的值）及其属性，并与二级节点关联
    relationships = []
    for index, row in df.iterrows():
        tertiary_node_value = row[tertiary_node_col]
        if pd.notna(tertiary_node_value) and tertiary_node_value != '':
            tertiary_node_name = sanitize_name(f"{tertiary_node_value}_{index}")
            relationships.append(f"""
            MERGE (t_{index}:{sanitize_name(tertiary_node_col)} {{name: '{tertiary_node_value}'}})
            MERGE (t_{index})-[:{sanitize_name(tertiary_node_col)}]->(s)
            """)
            for column, value in row.items():
                if column != tertiary_node_col and pd.notna(value) and value != '':
                    property_name = sanitize_name(column)
                    relationships.append(f"""
                    MERGE (p_{index}_{property_name}:{property_name} {{value: '{value}'}})
                    MERGE (t_{index})-[:{property_name}]->(p_{index}_{property_name})
                    """)
    
    # 组合所有 Cypher 语句
    cypher = cypher_file + cypher_sheet + ''.join(relationships)
    
    return cypher

In [4]:
product_sheets = ['SE预约+集控+虚拟分控（5间）','SE预约+集控+虚拟分控（40间）','SE预约+集控+硬件分控（40间）','SE仅预约含一键入会（20间）','SE仅预约无一键入会（40间）','ezCloud预约+集控+硬件分控（50间）','ezCloud仅预约无一键入会（100间）','SOFTAVC体验点','文体版WIFI（1间）','文体版5G（1间）']
with driver.session() as session:
    for num,sheet in enumerate(product_sheets):
        df = pd.read_excel('/mnt/d/project/zzbc/experiment_project/experiment_project/audio/data/audio/excloud/ezCloud.xlsx',sheet_name=sheet)
        cypher_statement = create_cypher(file_name='ez_cloud', sheet_name=sheet,df = df,tertiary_node_col='设备名称')
        
        execute_cypher(session, cypher_statement)
        print(f'完成入库 {sheet}   剩余  {len(product_sheets)-num}',)

完成入库 SE预约+集控+虚拟分控（5间）   剩余  11
完成入库 SE预约+集控+虚拟分控（40间）   剩余  10
完成入库 SE预约+集控+硬件分控（40间）   剩余  9
完成入库 SE仅预约含一键入会（20间）   剩余  8
完成入库 SE仅预约无一键入会（40间）   剩余  7
完成入库 ezCloud预约+集控+硬件分控（50间）   剩余  6
完成入库 ezCloud仅预约无一键入会（100间）   剩余  5
完成入库 SOFTAVC体验点   剩余  4
完成入库 文体版WIFI（1间）   剩余  3
完成入库 文体版5G（1间）   剩余  2


ValueError: Worksheet named '' not found

In [6]:
plan_sheets = ['模板说明']
df = pd.read_excel('/mnt/d/project/zzbc/experiment_project/experiment_project/audio/data/audio/excloud/ezCloud.xlsx',sheet_name=plan_sheets[0])
df

Unnamed: 0,模板名称,ezCloud版本,适配房间数,备注（注意事项）,预约,集控,分控,CS传感器,CTS控制屏,CTS信息屏,MT会议主机,MS集控主机
0,全套预约+集控,,,,,,,,,,,
1,SOFTAVC体验点,SaaS版,任意,采用云端ezCloud SaaS版本，与DP120智慧中控物联系统,✅,✅,✅,✅,✅,✅,✅,❌
2,SE预约+集控+虚拟分控（5间）,SE,5~20间,MS100作为虚拟分控和SE主机，性价比高,✅,✅,✅虚,✅,✅,✅,✅,✅
3,SE预约+集控+虚拟分控（40间）,SE,21~40间,MS100作为SE主机、DP120E虚拟分控，性价比高,✅,✅,✅虚,✅,✅,✅,✅,✅
4,SE预约+集控+硬件分控（40间）,SE,40间及以下,MS100作为SE主机、DP120分控，稳定安全，豪华配置,✅,✅,✅硬,✅,✅,✅,✅,✅
5,ezCloud预约+集控+硬件分控（40间）,标准版,任意,支持定制开发、集成对接、40间+会议室,✅,✅,✅硬,✅,✅,✅,✅,✅
6,仅预约,,,,,,,,,,,
7,SE仅预约含一键入会（20间）,SE,40间及以下,具备一键入会功能，需配置虚拟分控,✅,❌,✅虚,❌,✅,✅,✅,✅
8,SE仅预约无一键入会（40间）,SE,40间及以下,无一键入会功能，SE最多支持40间会议室,✅,❌,❌,❌,❌,✅,❌,✅
9,ezCloud仅预约无一键入会（100间）,标准版,任意,支持定制开发、集成对接、40间+会议室,✅,❌,❌,❌,❌,✅,❌,✅


In [22]:
import xlrd
# df = pd.read_excel('/mnt/d/project/zzbc/experiment_project/experiment_project/audio/data/zip/原始数据/解压后数据/product list 1.xlsx',encoding='utf-8')
df = pd.read_csv('/mnt/d/project/zzbc/experiment_project/experiment_project/audio/data/zip/原始数据/解压后数据/product list 1.xlsx',encoding='GB18030')
df
# df = xlrd.open_workbook('/mnt/d/project/zzbc/experiment_project/experiment_project/audio/data/zip/原始数据/解压后数据/product list 1.xlsx', encoding_override='gbk')
# # 
# # df = pd.read_excel(wb)
# df

UnicodeDecodeError: 'gb18030' codec can't decode byte 0x90 in position 22: illegal multibyte sequence

In [28]:
import xlrd

import openpyxl
import chardet

# 打开Excel文件
wb = openpyxl.load_workbook('./data/zip/原始数据/解压后数据/product list 1.xlsx')
wb
# # 选择工作表
sheet = wb['in']
# 
# # 读取数据
for row in sheet.iter_rows(values_only=True):
    # 检测编码方式
    print(row)
    # encoding = chardet.detect(row[0].encode())
    # 
    # # 解码并输出数据
    # print(row[0].encode(encoding).decode('utf-8'))

('????', '??', '????', '??', '??', 'ERP??', '??', '??', '??', '??', '??', '??', '????????', '????', '?????????', '????', '????????????', '??', '????', '?????????', '????', '???????')
(None, '??', '???????????????A??B??', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)
(None, 'A', 'ezCloud??????????????????????', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)
(1100000706, 'A1', '??????????', 'ezCloud', 'V2.0', 'SW-EZLINK-V02', '????????????????????????????????????????????????????????????????????????????????????????\n??????????????????IM????????????????????????????????WeLink??\n??????????????????????????????????????????????????ZOOM????????????????????????????Poly????????\n????????????????????????\n????????????????16?-18?????? \n?????????????????????????????????9~10???????14~15??????\n???????????????????????????????????????\n?????????????????????????????????

In [7]:
# from modelscope.hub.snapshot_download import snapshot_download  
#   
# model_dir = snapshot_download('DMetaSoul/Dmeta-embedding-zh', cache_dir='/mnt/d/models/embeddings', )

2024-05-20 13:36:30,550 - modelscope - ERROR - Authentication token does not exist, failed to access model DMetaSoul/Dmeta-embedding-zh which may not exist or may be                 private. Please login first.


HTTPError: Response details: {'Code': 10010205001, 'Message': '获取模型版本失败，信息：record not found', 'RequestId': '93b22f05-2de3-4989-ac30-df4fa8f4cc16', 'Success': False}, Request id: af2f43444d7b4d7ebb45238d0b6256f9

In [8]:
!pip install transformers
from transformers import AutoModel
from numpy.linalg import norm

cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))
model = AutoModel.from_pretrained('/mnt/d/models/embeddings/jinaai/jina-embeddings-v2-base-zh/', trust_remote_code=True) # trust_remote_code is needed to use the encode method
embeddings = model.encode(['How is the weather today?', '今天天气怎么样?'])
print(cos_sim(embeddings[0], embeddings[1]))


0.7860607


In [3]:
!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer
from numpy.linalg import norm

cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))
model = SentenceTransformer('/mnt/d/models/embeddings/jinaai/jina-embeddings-v2-base-zh/', trust_remote_code=True)
embeddings = model.encode(['How is the weather today?', '今天天气怎么样?'])
print(cos_sim(embeddings[0], embeddings[1]))


0.7860607


In [4]:
embeddings[0]

array([ 1.17582276e-01, -4.11230400e-02, -9.13661346e-02,  1.07975595e-01,
        9.68639478e-02, -3.01627163e-03,  9.99230891e-02,  3.91312763e-02,
        8.71287882e-02,  3.01063627e-01, -2.57130545e-02,  1.04525745e-01,
        9.42578614e-02, -7.57045066e-03,  1.28830031e-01, -1.93333060e-01,
       -5.31106889e-02,  6.59394190e-02, -2.49039456e-01,  2.41258815e-01,
       -6.68691695e-02, -3.61450426e-02, -1.13201402e-02,  2.87404388e-01,
        1.91802859e-01,  6.27591684e-02, -3.02702300e-02,  6.95308447e-02,
       -1.63315833e-01, -7.99329132e-02, -2.42121052e-04, -9.61904824e-02,
        9.71395001e-02,  7.18620792e-02,  1.86699867e-01, -3.65295112e-02,
        6.21305034e-02,  1.91574141e-01,  1.09463885e-01,  5.40749282e-02,
       -3.19882706e-02,  1.62023623e-02,  5.01649827e-02,  1.06443204e-01,
       -1.79785118e-02, -6.52830601e-02, -5.84530570e-02,  4.02697138e-02,
       -5.36461473e-02,  4.89265881e-02, -8.03642273e-02, -4.55682985e-02,
       -1.33975849e-01,  